Package rdkit :: Package ML :: Package DecTree :: Module ID3
[hide private]
[frames] | [no frames]

Source Code for Module rdkit.ML.DecTree.ID3

  1  # 
  2  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  3  # 
  4  """ ID3 Decision Trees 
  5   
  6    contains an implementation of the ID3 decision tree algorithm 
  7    as described in Tom Mitchell's book "Machine Learning" 
  8   
  9    It relies upon the _Tree.TreeNode_ data structure (or something 
 10      with the same API) defined locally to represent the trees 
 11   
 12  """ 
 13   
 14  import numpy 
 15   
 16  from rdkit.ML.DecTree import DecTree 
 17  from rdkit.ML.InfoTheory import entropy 
 18   
 19   
def CalcTotalEntropy(examples, nPossibleVals):
  """ Computes the informational entropy of the data set (w.r.t. the results)

    **Arguments**

      - examples: a list (nInstances long) of lists of variable values + instance
        values

      - nPossibleVals: a list (nVars long) of the number of possible values each
        variable can adopt.

    **Returns**

      a float containing the informational entropy of the data set.

  """
  # tally how often each result code (the last entry of every example) occurs
  nResultCodes = nPossibleVals[-1]
  resultCounts = numpy.zeros(nResultCodes, 'i')
  for example in examples:
    resultCounts[int(example[-1])] += 1
  return entropy.InfoEntropy(resultCounts)
41 42
def GenVarTable(examples, nPossibleVals, vars):
  """Generates a list of variable tables for the examples passed in.

    The table for a given variable records the number of times each possible
    value of that variable appears for each possible result of the function.

    **Arguments**

      - examples: a list (nInstances long) of lists of variable values + instance
        values

      - nPossibleVals: a list containing the number of possible values of
        each variable + the number of values of the function.

      - vars: a list of the variables to include in the var table

    **Returns**

      a list of variable result tables. Each table is a Numeric array
      which is varValues x nResults
  """
  nFuncVals = nPossibleVals[-1]
  # one zeroed (nValues x nResults) counting table per requested variable
  tables = [numpy.zeros((nPossibleVals[v], nFuncVals), 'i') for v in vars]
  for example in examples:
    outcome = int(example[-1])
    for table, v in zip(tables, vars):
      table[int(example[v]), outcome] += 1
  return tables
77 78
def ID3(examples, target, attrs, nPossibleVals, depth=0, maxDepth=-1, **kwargs):
  """ Implements the ID3 algorithm for constructing decision trees.

    From Mitchell's book, page 56

    This is *slightly* modified from Mitchell's book because it supports
    multivalued (non-binary) results.

    **Arguments**

      - examples: a list (nInstances long) of lists of variable values + instance
        values

      - target: an int with the index of the variable that was split on to
        produce this call (used only to generate this node's var tables)

      - attrs: a list of ints indicating which variables can be used in the tree

      - nPossibleVals: a list containing the number of possible values of
        every variable.

      - depth: (optional) the current depth in the tree

      - maxDepth: (optional) the maximum depth to which the tree
        will be grown; -1 (the default) means unlimited

      - kwargs: if _recycleVars_ is set truthy, the variable split on here
        remains available to the subtrees below

    **Returns**

      a DecTree.DecTreeNode with the decision tree

    **NOTE:** This code cannot bootstrap (start from nothing...)
          use _ID3Boot_ (below) for that.
  """
  varTable = GenVarTable(examples, nPossibleVals, attrs)
  tree = DecTree.DecTreeNode(None, 'node')

  # store the total entropy... in case that is interesting
  totEntropy = CalcTotalEntropy(examples, nPossibleVals)
  tree.SetData(totEntropy)
  # tree.SetExamples(examples)

  # the matrix of results for this target:
  tMat = GenVarTable(examples, nPossibleVals, [target])[0]
  # counts of each result code: builtin sum() over the 2D table adds the
  # rows, yielding a 1D array of per-result-code totals
  counts = sum(tMat)
  nzCounts = numpy.nonzero(counts)[0]

  if len(nzCounts) == 1:
    # bottomed out because there is only one result code left
    # with any counts (i.e. there's only one type of example
    # left... this is GOOD!).
    res = nzCounts[0]
    tree.SetLabel(res)
    tree.SetName(str(res))
    tree.SetTerminal(1)
  elif len(attrs) == 0 or (maxDepth >= 0 and depth >= maxDepth):
    # Bottomed out: no variables left or max depth hit
    # We don't really know what to do here, so
    # use the heuristic of picking the most prevalent
    # result
    v = numpy.argmax(counts)
    tree.SetLabel(v)
    # the trailing '?' marks this label as a guess, not a pure leaf
    tree.SetName('%d?' % v)
    tree.SetTerminal(1)
  else:
    # find the variable which gives us the largest information gain
    gains = [entropy.InfoGain(x) for x in varTable]
    best = attrs[numpy.argmax(gains)]

    # remove that variable from the lists of possible variables
    # (copy first so the caller's attrs list is not mutated)
    nextAttrs = attrs[:]
    if not kwargs.get('recycleVars', 0):
      nextAttrs.remove(best)

    # set some info at this node
    tree.SetName('Var: %d' % best)
    tree.SetLabel(best)
    # tree.SetExamples(examples)
    tree.SetTerminal(0)

    # loop over possible values of the new variable and
    # build a subtree for each one
    for val in range(nPossibleVals[best]):
      nextExamples = []
      for example in examples:
        if example[best] == val:
          nextExamples.append(example)
      if len(nextExamples) == 0:
        # this particular value of the variable has no examples,
        # so there's not much sense in recursing.
        # This can (and does) happen; fall back to the most
        # prevalent result at this node as a terminal child.
        v = numpy.argmax(counts)
        tree.AddChild('%d' % v, label=v, data=0.0, isTerminal=1)
      else:
        # recurse
        tree.AddChildNode(
          ID3(nextExamples, best, nextAttrs, nPossibleVals, depth + 1, maxDepth, **kwargs))
  return tree
177 178
def ID3Boot(examples, attrs, nPossibleVals, initialVar=None, depth=0, maxDepth=-1, **kwargs):
  """ Bootstrapping code for the ID3 algorithm

    see ID3 for descriptions of the arguments

    If _initialVar_ is not set, the algorithm will automatically
    choose the first variable in the tree (the standard greedy
    approach).  Otherwise, _initialVar_ will be used as the first
    split.

  """
  rootEntropy = CalcTotalEntropy(examples, nPossibleVals)
  varTables = GenVarTable(examples, nPossibleVals, attrs)

  root = DecTree.DecTreeNode(None, 'node')
  # tree.SetExamples(examples)
  root._nResultCodes = nPossibleVals[-1]

  # pick the first split: the caller's choice, if any, otherwise the
  # attribute with the largest information gain
  if initialVar is None:
    gains = [entropy.InfoGain(tbl) for tbl in varTables]
    splitVar = attrs[numpy.argmax(gains)]
  else:
    splitVar = initialVar

  root.SetName('Var: %d' % splitVar)
  root.SetData(rootEntropy)
  root.SetLabel(splitVar)
  root.SetTerminal(0)

  # copy before removing so the caller's attrs list is untouched
  remainingAttrs = list(attrs)
  if not kwargs.get('recycleVars', 0):
    remainingAttrs.remove(splitVar)

  # grow one subtree per possible value of the chosen variable
  for val in range(nPossibleVals[splitVar]):
    subset = [example for example in examples if example[splitVar] == val]
    root.AddChildNode(ID3(subset, splitVar, remainingAttrs, nPossibleVals, depth, maxDepth,
                          **kwargs))
  return root
220