Package rdkit :: Package ML :: Package InfoTheory :: Module BitRank
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.InfoTheory.BitRank

  1  # 
  2  #  Copyright (C) 2001,2002,2003  greg Landrum and Rational Discovery LLC 
  3  # 
  4  """ Functionality for ranking bits using info gains 
  5   
  6   **Definitions used in this module** 
  7   
  8      - *sequence*: an object capable of containing other objects which supports 
  9        __getitem__() and __len__().  Examples of these include lists, tuples, and 
 10        numpy arrays. 
 11   
 12      - *IntVector*: an object containing integers which supports __getitem__() and 
 13         __len__(). Examples include lists, tuples, numpy arrays, and BitVects. 
 14   
 15   
 16   **NOTE**: Neither *sequences* nor *IntVectors* need to support item assignment. 
 17     It is perfectly acceptable for them to be read-only, so long as they are 
 18     random-access. 
 19   
 20  """ 
 21  import numpy 
 22   
 23  from rdkit.ML.InfoTheory import entropy 
 24   
 25   
def FormCounts(bitVects, actVals, whichBit, nPossibleActs, nPossibleBitVals=2):
    """ generates the counts matrix for a particular bit

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence* of integer activity values, one per entry of
        _bitVects_.  Each value is used as a column index into the result.

      - whichBit: an integer, the bit number to use.

      - nPossibleActs: the (integer) number of possible activity values
        (the number of columns in the result).

      - nPossibleBitVals: (optional) the number of rows in the result.  The
        value found at _whichBit_ in each vector is used as a row index, so
        it must be a valid index into that many rows.

    **Returns**

      a numpy integer array of shape (nPossibleBitVals, nPossibleActs);
      element [v, a] is the number of points whose selected bit has value v
      and whose activity is a.

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length.

    **Notes**

      This is really intended for internal use.

    """
    if len(bitVects) != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    # The builtin int selects numpy's default integer dtype.  The abstract
    # numpy.integer class is not a concrete dtype and numpy deprecates
    # passing it where a dtype is expected.
    res = numpy.zeros((nPossibleBitVals, nPossibleActs), int)
    # Index-based access on purpose: the inputs are only required to support
    # __getitem__() and __len__().
    for i in range(len(bitVects)):
        res[bitVects[i][whichBit], actVals[i]] += 1
    return res
57 58
def CalcInfoGains(bitVects, actVals, nPossibleActs, nPossibleBitVals=2):
    """ Calculates the information gain for a set of points and activity values

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*.  The number of bits
        is taken from the length of the first entry.

      - actVals: a *sequence*

      - nPossibleActs: the (integer) number of possible activity values.

      - nPossibleBitVals: (optional) passed through to _FormCounts()_, where
        it sets the number of rows of the per-bit counts matrix.

    **Returns**

      a numpy array of floats with one entry per bit: the value of
      _entropy.InfoGain()_ for that bit's counts matrix.

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length.

    """
    if len(bitVects) != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    nBits = len(bitVects[0])
    # The numpy.float alias was removed from numpy; the builtin float yields
    # the same double-precision dtype that alias stood for.
    res = numpy.zeros(nBits, float)

    for bit in range(nBits):
        counts = FormCounts(bitVects, actVals, bit, nPossibleActs,
                            nPossibleBitVals=nPossibleBitVals)
        res[bit] = entropy.InfoGain(counts)
    return res
87 88
def RankBits(bitVects, actVals, nPossibleBitVals=2, metricFunc=CalcInfoGains):
    """ Orders the bits of a data set from best to worst according to a metric

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence*; the number of possible activities handed to
        the metric function is max(actVals) + 1.

      - nPossibleBitVals: (optional) forwarded unchanged to _metricFunc_ as a
        keyword argument.

      - metricFunc: (optional) the metric function to be used.  It is called
        the same way as _CalcInfoGains()_; see that function for the
        signature.

    **Returns**

      A 2-tuple containing:

        - the bit indices sorted from the highest metric value to the lowest
          (a list of ints)

        - the metric values exactly as returned by _metricFunc_

    """
    scores = metricFunc(bitVects, actVals, max(actVals) + 1,
                        nPossibleBitVals=nPossibleBitVals)
    # argsort is ascending; walk it backwards to put the best bit first
    ascending = numpy.argsort(scores)
    return list(ascending[::-1]), scores
118 119
def AnalyzeSparseVects(bitVects, actVals):
    """ Calculates per-bit on-counts and info gains for sparse bit vectors

    **Arguments**

      - bitVects: a *sequence* containing SBVs.  Each entry must provide
        GetOnBits(); the first entry's GetSize() gives the number of bits.

      - actVals: a *sequence*; each value is only tested for truth
        (true -> active, false -> inactive).

    **Returns**

      A 2-tuple containing:

        - a list of (bit, gain, nAct, nInact) tuples, one for every bit that
          is on in at least one vector, in increasing bit order.  nAct and
          nInact are the numbers of active and inactive points that have the
          bit set.

        - a list with the gains alone, in the same order

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length.

    **Notes**

      - these need to be bit vects and binary activities

    """
    nPts = len(bitVects)
    if nPts != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    nBits = bitVects[0].GetSize()

    # The builtin int selects numpy's default integer dtype.  The abstract
    # numpy.integer class is not a concrete dtype and numpy deprecates
    # passing it where a dtype is expected.
    actives = numpy.zeros(nBits, int)
    inactives = numpy.zeros(nBits, int)
    for i in range(nPts):
        # pick the tally matching this point's activity class
        tally = actives if actVals[i] else inactives
        for bit in bitVects[i].GetOnBits():
            tally[bit] += 1

    # one scratch table, refilled for every bit that is on at least once
    resTbl = numpy.zeros((2, 2), int)
    res = []
    gains = []
    for bit in range(nBits):
        nAct, nInact = actives[bit], inactives[bit]
        if nAct or nInact:
            # NOTE(review): the second row is filled with nPts - n, not with
            # (number of active points - nAct) and (number of inactive
            # points - nInact), so the table sums to 2 * nPts.  This looks
            # unintended, but it is kept as-is to preserve the existing
            # numerical output -- confirm before changing.
            resTbl[0, 0] = nAct
            resTbl[1, 0] = nPts - nAct
            resTbl[0, 1] = nInact
            resTbl[1, 1] = nPts - nInact
            gain = entropy.InfoGain(resTbl)
            gains.append(gain)
            res.append((bit, gain, nAct, nInact))
    return res, gains
171 172
def SparseRankBits(bitVects, actVals, metricFunc=AnalyzeSparseVects):
    """ Orders the results of a sparse metric function from best to worst

    **Arguments**

      - bitVects: a *sequence* containing SBVs

      - actVals: a *sequence*

      - metricFunc: (optional) the metric function to be used.  It is called
        as metricFunc(bitVects, actVals) and must return a 2-tuple
        (info, metrics).  See _AnalyzeSparseVects()_ for the default.

    **Returns**

      A 2-tuple containing:

        - the positions of the entries of _metrics_, sorted from the highest
          metric value to the lowest (a list of ints).  These are positions
          in the lists returned by _metricFunc_, which need not be the bit
          numbers themselves.

        - the _info_ object exactly as returned by _metricFunc_

    **Notes**

      - these need to be bit vects and binary activities

    """
    info, scores = metricFunc(bitVects, actVals)
    # argsort is ascending; walk it backwards to put the best entry first
    ascending = numpy.argsort(scores)
    return list(ascending[::-1]), info
202