1
2
3
4 """ Functionality for ranking bits using info gains
5
6 **Definitions used in this module**
7
 - *sequence*: an object capable of containing other objects which supports
   __getitem__() and __len__(). Examples of these include lists, tuples, and
   numpy arrays.
11
 - *IntVector*: an object containing integers which supports __getitem__() and
   __len__(). Examples include lists, tuples, numpy arrays, and BitVects.
14
15
16 **NOTE**: Neither *sequences* nor *IntVectors* need to support item assignment.
17 It is perfectly acceptable for them to be read-only, so long as they are
18 random-access.
19
20 """
21 import numpy
22
23 from rdkit.ML.InfoTheory import entropy
24
25
57
58
def CalcInfoGains(bitVects, actVals, nPossibleActs, nPossibleBitVals=2):
    """ Calculates the information gain for a set of points and activity values

    For every bit position a counts table is built with _FormCounts()_ and
    passed to _entropy.InfoGain()_; the result is stored at that position of
    the returned array.

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*. The length of the
        first entry determines how many bits are evaluated.

      - actVals: a *sequence* with one activity value per entry of _bitVects_

      - nPossibleActs: the (integer) number of possible activity values.

      - nPossibleBitVals: (optional) if specified, this integer provides the maximum
        value attainable by the (increasingly inaccurately named) bits in _bitVects_.

    **Returns**

      a numpy array of floats holding one information gain per bit

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length

    """
    if len(bitVects) != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    nBits = len(bitVects[0])
    # The builtin float is used as the dtype: the numpy.float alias was
    # deprecated in NumPy 1.20 and removed in 1.24. Both mean float64.
    res = numpy.zeros(nBits, float)

    for bit in range(nBits):
        counts = FormCounts(bitVects, actVals, bit, nPossibleActs,
                            nPossibleBitVals=nPossibleBitVals)
        res[bit] = entropy.InfoGain(counts)
    return res
87
88
    """ Rank a set of bits according to a metric function

    The number of possible activity values handed to the metric function is
    computed here as max(actVals) + 1.

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence*

      - nPossibleBitVals: (optional) if specified, this integer provides the maximum
        value attainable by the (increasingly inaccurately named) bits in _bitVects_.

      - metricFunc: (optional) the metric function to be used. It is called with
        (bitVects, actVals, nPossibleActs, nPossibleBitVals=nPossibleBitVals).
        See _CalcInfoGains()_ for a description of the signature of this function.

    **Returns**

      A 2-tuple containing:

        - the bit indices ordered from the highest metric value to the
          lowest (a list of ints)

        - the metric calculated for each bit, exactly as returned by
          _metricFunc_

    """
    # NOTE(review): presumably activities are integers starting at 0, so that
    # max + 1 is the number of classes -- confirm against callers.
    nPossibleActs = max(actVals) + 1
    metrics = metricFunc(bitVects, actVals, nPossibleActs, nPossibleBitVals=nPossibleBitVals)
    # argsort sorts ascending; reversing puts the best-scoring bits first
    bitOrder = list(numpy.argsort(metrics))
    bitOrder.reverse()
    return bitOrder, metrics
118
119
    """ Calculates information gains for the bits that are set in a
    collection of sparse bit vectors

    For each point the on bits are tallied separately for active (truthy
    activity) and inactive points. Every bit that is on in at least one
    point then gets a 2x2 counts table that is passed to _entropy.InfoGain()_.

    **Arguments**

      - bitVects: a *sequence* containing SBVs (objects providing GetSize()
        and GetOnBits())

      - actVals: a *sequence*

    **Returns**

      A 2-tuple containing:

        - a list of (bit, gain, nAct, nInact) tuples, one for each bit that
          is on in at least one point

        - a list with the gains of those same bits, in the same order

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length

    **Notes**

      - these need to be bit vects and binary activities

      - bits that are never set are skipped, so positions in the returned
        lists are not bit indices

    """
    nPts = len(bitVects)
    if nPts != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    nBits = bitVects[0].GetSize()

    # NOTE(review): numpy.integer is an abstract scalar type; NumPy has
    # deprecated using it as a dtype -- consider a concrete integer dtype.
    actives = numpy.zeros(nBits, numpy.integer)
    inactives = numpy.zeros(nBits, numpy.integer)
    # NOTE(review): nActives and nInactives are accumulated below but never
    # read afterwards -- confirm whether the counts table was meant to use them.
    nActives, nInactives = 0, 0
    for i in range(nPts):
        sig, act = bitVects[i], actVals[i]
        onBitList = sig.GetOnBits()
        if act:
            for bit in onBitList:
                actives[bit] += 1
            nActives += 1
        else:
            for bit in onBitList:
                inactives[bit] += 1
            nInactives += 1
    # the same table is reused and overwritten for each bit
    resTbl = numpy.zeros((2, 2), numpy.integer)
    res = []
    gains = []
    for bit in range(nBits):
        nAct, nInact = actives[bit], inactives[bit]
        # only bits that are on in at least one point are scored
        if nAct or nInact:
            # row 0: points with the bit on; column 0: actives, column 1: inactives
            resTbl[0, 0] = nAct
            # NOTE(review): row 1 subtracts from the total number of points,
            # not from the number of actives/inactives -- confirm this is intended.
            resTbl[1, 0] = nPts - nAct
            resTbl[0, 1] = nInact
            resTbl[1, 1] = nPts - nInact
            gain = entropy.InfoGain(resTbl)
            gains.append(gain)
            res.append((bit, gain, nAct, nInact))
    return res, gains
171
172
    """ Rank a set of bits according to a metric function

    **Arguments**

      - bitVects: a *sequence* containing SBVs

      - actVals: a *sequence*

      - metricFunc: (optional) the metric function to be used. It is called with
        (bitVects, actVals) and must return a 2-tuple of (info, metrics).
        See _SparseCalcInfoGains()_ for a description of the signature of this function.

    **Returns**

      A 2-tuple containing:

        - the positions in the metric list ordered from the highest metric
          value to the lowest (a list of ints)

        - the first element returned by _metricFunc_ (the per-bit info),
          passed through unchanged

    **Notes**

      - these need to be bit vects and binary activities

      - NOTE(review): the ordering indexes into the lists returned by
        _metricFunc_; if that function skips bits, these positions are
        presumably not bit ids -- confirm against callers.

    """
    info, metrics = metricFunc(bitVects, actVals)
    # argsort sorts ascending; reversing puts the best-scoring entries first
    bitOrder = list(numpy.argsort(metrics))
    bitOrder.reverse()
    return bitOrder, info
202