Package rdkit :: Package ML :: Package Descriptors :: Module CompoundDescriptors
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Descriptors.CompoundDescriptors

  1  # 
  2  #  Copyright (C) 2001,2002  greg Landrum and Rational Discovery LLC 
  3  # 
  4  """ descriptor calculator for compounds defined by a composition alone 
  5    (only the composition is required) 
  6   
  7  """ 
  8  from __future__ import print_function 
  9   
 10  from rdkit import RDConfig 
 11  from rdkit.ML.Descriptors import Parser, Descriptors 
 12  from rdkit.utils import chemutils 
 13   
 14  # the list of possible ways to count valence electrons that we know 
 15  countOptions = [('NVAL', 'total number of valence electrons'), 
 16                  ('NVAL_NO_FULL_F', 'number of valence electrons neglecting filled f shells'), 
 17                  ('NVAL_NO_FULL_D', 'number of valence electrons neglecting filled d shells'), 
 18                  ('NVAL_NO_FULL', 'number of valence electrons neglecting filled f and d shells')] 
 19   
 20   
def GetAllDescriptorNames(db, tbl1, tbl2, user='sysdba', password='masterkey'):
  """ gets possible descriptor names from a database

    **Arguments**

      - db: the name of the database to use

      - tbl1: the name of the table to be used for reading descriptor values

      - tbl2: the name of the table to be used for reading notes about the
        descriptors (*descriptions of the descriptors if you like*)

      - user: the user name for DB access

      - password: the password for DB access

    **Returns**

      a 2-tuple containing:

        1) a list of column names (the columns of _tbl1_ followed by the
           names in _countOptions_)

        2) a list of column descriptors: '(NAME, description)' 2-tuples built
           from _tbl2_ (names upper-cased) followed by the _countOptions_ entries

    **Notes**

      - this uses _Dbase.DbConnection.DbConnect_ for querying the database

      - it is assumed that tbl2 includes 'property' and 'notes' columns

  """
  from rdkit.Dbase.DbConnection import DbConnect
  conn = DbConnect(db, user=user, password=password)

  # Build real lists: both results are extended below, and on Python 3 map()
  # would hand back a one-shot iterator with no append().  Copying the column
  # names also avoids extending a list the connection object may still own.
  colNames = list(conn.GetColumnNames(table=tbl1))
  colDesc = [(row[0].upper(), row[1]) for row in conn.GetColumns('property,notes', table=tbl2)]

  # the electron-count pseudo-descriptors are always available
  for name, desc in countOptions:
    colNames.append(name)
    colDesc.append((name, desc))
  return colNames, colDesc
61 62
class CompoundDescriptorCalculator(Descriptors.DescriptorCalculator):
  """ used for calculating descriptors

   This is the central point for descriptor calculation

   **Notes**

   - There are two kinds of descriptors this cares about:

      1) *Simple Descriptors* can be calculated solely using atomic descriptor
         values and the composition of the compound.  The full list of possible
         simple descriptors is determined by the types of *Calculator Methods*
         (see below) and the contents of an atomic database.

         Simple Descriptors can be marked as *nonZeroDescriptors*.  These are used
         to winnow out atom types where particular atomic descriptors are zero
         (usually indicating that the value is unknown)

         Simple Descriptors are maintained locally in the _simpleList_

      2) *Compound Descriptors* may rely upon more complicated computation schemes
         and descriptors for the compound as a whole (e.g. structural variables, etc.).
         The full list of compound descriptors is limitless.  They are calculated using
         the _ML.Descriptors.Parser_ module.

         Compound Descriptors are maintained locally in the _compoundList_

   - This class has some special methods which are labelled as *Calculator Method*
     These are used internally to take atomic descriptors and reduce them to a single
     simple descriptor value for a composition.  They are primarily intended for internal use.

   - a *composition vector* is a list of 2-tuples: '[(atom1name,atom1Num),...]'
     where atom1Num is the contribution of the atom to the stoichiometry of the
     compound. No assumption is made about the stoichiometries (i.e. they don't
     have to be either integral or all sum to one).

   - _BuildAtomDict()_ must be called before any of the calculation methods;
     it is what creates the _atomDict_ attribute they read from.

  """

  # ------------
  #  methods used to calculate descriptors
  # ------------

  def SUM(self, desc, compos):
    """ *Calculator Method*

      sums the descriptor values across the composition

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float

    """
    res = 0.0
    for atom, num in compos:
      res += self.atomDict[atom][desc] * num
    return res

  def MEAN(self, desc, compos):
    """ *Calculator Method*

      averages the descriptor values across the composition
      (weighted by the stoichiometric contribution of each atom)

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float

      **Notes**

        - a composition whose contributions sum to zero (including an empty
          one) raises a _ZeroDivisionError_

    """
    res = 0.0
    nSoFar = 0.0
    for atom, num in compos:
      res += self.atomDict[atom][desc] * num
      nSoFar += num
    return res / nSoFar

  def DEV(self, desc, compos):
    """ *Calculator Method*

      average deviation of the descriptor values across the composition
      (mean absolute deviation about _MEAN_, weighted by stoichiometry)

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float

    """
    mean = self.MEAN(desc, compos)
    res = 0.0
    nSoFar = 0.0
    for atom, num in compos:
      res += abs(self.atomDict[atom][desc] - mean) * num
      nSoFar += num
    return res / nSoFar

  def MIN(self, desc, compos):
    """ *Calculator Method*

      minimum of the descriptor values across the composition
      (the stoichiometric contributions are ignored)

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float

    """
    return min(self.atomDict[entry[0]][desc] for entry in compos)

  def MAX(self, desc, compos):
    """ *Calculator Method*

      maximum of the descriptor values across the composition
      (the stoichiometric contributions are ignored)

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float

    """
    return max(self.atomDict[entry[0]][desc] for entry in compos)

  # ------------
  #  Other methods
  # ------------

  def ProcessSimpleList(self):
    """ Handles the list of simple descriptors

      This constructs the list of _nonZeroDescriptors_ and _requiredDescriptors_.

      For every entry of _simpleList_ that carries the 'NONZERO' pseudo-operation:

        - a 'NAME != 0' condition is added to _nonZeroDescriptors_ (unless the
          descriptor is one of the electron-count options in _countOptions_)

        - 'NONZERO' is stripped from the entry's operation list; if it was the
          only operation, the whole entry is dropped from _simpleList_

      _requiredDescriptors_ is then set to the names remaining in _simpleList_,
      minus the electron-count options.

      **Notes**

        - this modifies _simpleList_ in place, so the 'NONZERO' markers are
          only seen the first time it is called.

    """
    countNames = [opt[0] for opt in countOptions]

    self.nonZeroDescriptors = []
    # iterate over a copy: entries may be removed from simpleList as we go
    for entry in self.simpleList[:]:
      name, operations = entry
      if 'NONZERO' not in operations:
        continue
      if name not in countNames:
        self.nonZeroDescriptors.append('%s != 0' % name)
      if len(operations) == 1:
        # NONZERO was the only thing requested: nothing left to calculate
        self.simpleList.remove(entry)
      else:
        # NONZERO is a filter, not a Calculator Method, so it must not stay
        # in the list of operations
        operations.remove('NONZERO')

    # These must be real lists (not map iterators): they are searched more
    # than once here and extended later by ProcessCompoundList().
    self.requiredDescriptors = [entry[0] for entry in self.simpleList]
    # NOTE(review): the count options are presumably supplied by
    # chemutils.GetAtomicData (includeElCounts=1) rather than read as
    # database columns -- confirm against chemutils.
    for name in countNames:
      if name in self.requiredDescriptors:
        self.requiredDescriptors.remove(name)

  def ProcessCompoundList(self):
    """ Adds entries from the _compoundList_ to the list of _requiredDescriptors_

      Each compound descriptor is surveyed.  Any atomic descriptors it requires
      are added to the list of _requiredDescriptors_ to be pulled from the database.

      **Notes**

        - _ProcessSimpleList()_ must have been called first, since it is what
          creates _requiredDescriptors_

    """
    # add in the atomic descriptors we will need
    for entry in self.compoundList:
      for atomicDesc in entry[1]:
        if atomicDesc != '' and atomicDesc not in self.requiredDescriptors:
          self.requiredDescriptors.append(atomicDesc)

  def BuildAtomDict(self):
    """ builds the local atomic dict

     We don't want to keep around all descriptor values for all atoms, so this
     method takes care of only pulling out the descriptors in which we are
     interested.

     **Notes**

       - this uses _chemutils.GetAtomicData_ to actually pull the data

       - the _nonZeroDescriptors_ conditions are joined with ' and ' into a
         'where ...' string that is handed to _chemutils.GetAtomicData_, so
         descriptor names are expected to come from a trusted source.

    """
    self.ProcessSimpleList()
    self.ProcessCompoundList()

    self.atomDict = {}
    whereString = ' and '.join(self.nonZeroDescriptors)
    if whereString != '':
      whereString = 'where ' + whereString
    chemutils.GetAtomicData(self.atomDict, self.requiredDescriptors, self.dbName, self.dbTable,
                            whereString, self.dbUser, self.dbPassword, includeElCounts=1)

  def CalcSimpleDescriptorsForComposition(self, compos='', composList=None):
    """ calculates all simple descriptors for a given composition

      **Arguments**

        - compos: a string representation of the composition

        - composList: a *composVect*

        The client must provide either _compos_ or _composList_.  If both are
        provided, _composList_ takes priority.

      **Returns**
        the list of descriptor values

      **Notes**

        - when _compos_ is provided, this uses _chemutils.SplitComposition_
          to split the composition into its individual pieces

        - if problems are encountered because of either an unknown descriptor or
          atom type, a _KeyError_ will be raised.

        - operations that do not correspond to a *Calculator Method* are
          reported on stdout and skipped

    """
    if composList is None:
      composList = chemutils.SplitComposition(compos)
    res = []
    try:
      for descName, targets in self.simpleList:
        for target in targets:
          try:
            method = getattr(self, target)
          except AttributeError:
            print('Method %s does not exist' % (target))
          else:
            res.append(method(descName, composList))
    except KeyError:
      # wrap in a 1-tuple so a tuple-valued composList cannot break the formatting
      print('composition %s caused problems' % (composList, ))
      # re-raise the original exception so its key and traceback are preserved
      raise
    return res

  def CalcCompoundDescriptorsForComposition(self, compos='', composList=None, propDict=None):
    """ calculates all compound descriptors for a given composition

      **Arguments**

        - compos: a string representation of the composition

        - composList: a *composVect*

        - propDict: a dictionary containing the properties of the composition
          as a whole (e.g. structural variables, etc.).  If omitted, an empty
          dictionary is used.

        The client must provide either _compos_ or _composList_.  If both are
        provided, _composList_ takes priority.

      **Returns**
        the list of descriptor values

      **Notes**

        - when _compos_ is provided, this uses _chemutils.SplitComposition_
          to split the composition into its individual pieces

        - the values themselves are calculated by
          _Parser.CalcSingleCompoundDescriptor_

    """
    if propDict is None:
      # a fresh dict per call: a shared mutable default would leak state
      # between calls if anything downstream modified it
      propDict = {}
    if composList is None:
      composList = chemutils.SplitComposition(compos)
    res = []
    for cl in self.compoundList:
      # cl[0] is the descriptor's name; the parser only needs the rest
      val = Parser.CalcSingleCompoundDescriptor(composList, cl[1:], self.atomDict, propDict)
      res.append(val)
    return res

  def CalcDescriptorsForComposition(self, composVect, propDict):
    """ calculates all descriptors for a given composition

      **Arguments**

        - composVect: a sequence whose first element is the string
          representation of the composition

        - propDict: a dictionary containing the properties of the composition
          as a whole (e.g. structural variables, etc.). These are used to
          generate Compound Descriptors

      **Returns**
        a tuple of all descriptor values (simple descriptors first, then
        compound descriptors).  If a _KeyError_ is raised while calculating
        the simple descriptors, an empty tuple is returned instead.

      **Notes**

        - this uses _chemutils.SplitComposition_
          to split the composition into its individual pieces

    """
    composList = chemutils.SplitComposition(composVect[0])
    try:
      r1 = self.CalcSimpleDescriptorsForComposition(composList=composList)
    except KeyError:
      res = []
    else:
      r2 = self.CalcCompoundDescriptorsForComposition(composList=composList, propDict=propDict)
      res = r1 + r2

    return tuple(res)

  # alias for CalcDescriptorsForComposition
  CalcDescriptors = CalcDescriptorsForComposition

  def GetDescriptorNames(self):
    """ returns a tuple of the names of the descriptors this calculator generates

      Simple descriptors are named 'OPERATION_DESCRIPTOR' (e.g. 'MEAN_DED');
      compound descriptors use the name given in _compoundList_.

      **Notes**

        - the names are computed on the first call and cached in
          _descriptorNames_; operations that do not correspond to a
          *Calculator Method* are reported on stdout and skipped

    """
    if self.descriptorNames is None:
      res = []
      for descName, targets in self.simpleList:
        for target in targets:
          if hasattr(self, target):
            res.append('%s_%s' % (target, descName))
          else:
            print('Method %s does not exist' % (target))
      for entry in self.compoundList:
        res.append(entry[0])
      self.descriptorNames = res
    # always hand back a tuple so that the first and later calls agree and
    # callers cannot modify the cached list
    return tuple(self.descriptorNames)

  def __init__(self, simpleList, compoundList=None, dbName=None, dbTable='atomic_data',
               dbUser='sysdba', dbPassword='masterkey'):
    """ Constructor

      **Arguments**

        - simpleList: list of simple descriptors to be calculated
              (see below for format)

        - compoundList: list of compound descriptors to be calculated
              (see below for format)

        - dbName: name of the atomic database to be used
              (defaults to _RDConfig.RDDataDatabase_)

        - dbTable: name the table in _dbName_ which has atomic data

        - dbUser: user name for DB access

        - dbPassword: password for DB access

      **Note**

        - format of simpleList:
           a list of 2-tuples containing:

              1) name of the atomic descriptor

              2) a list of operations on that descriptor (e.g. NonZero, Max, etc.)
                 These must correspond to the *Calculator Method* names above.

           Names and operations are converted to upper case.

        - format of compoundList:
           a list of 4-tuples containing:

              1) name of the descriptor to be calculated

              2) list of selected atomic descriptor names (define $1, $2, etc.)

              3) list of selected compound descriptor names (define $a, $b, etc.)

              4) text formula defining the calculation (see _Parser_)

    """

    if dbName is None:
      dbName = RDConfig.RDDataDatabase

    Descriptors.DescriptorCalculator.__init__(self)
    # store our own upper-cased copy; ProcessSimpleList() modifies it in place
    self.simpleList = [(x[0].upper(), [y.upper() for y in x[1]]) for x in simpleList]
    self.descriptorNames = None
    self.compoundList = compoundList
    if self.compoundList is None:
      self.compoundList = []
    self.dbName = dbName
    self.dbTable = dbTable
    self.dbUser = dbUser
    self.dbPassword = dbPassword
455 456
def _exampleCode():
  """ small demonstration: builds a calculator from the default atomic
    database, prints part of its atomic dict and the descriptor names, then
    prints the simple descriptors for a few Nb/Pt compositions

  """
  simpleSpecs = [
    ('DED', ['NonZero', 'Mean', 'Dev']),
    ('M_B_electroneg', ['NonZero']),
    ('Cov_rad', ['Max', 'Min']),
  ]
  calc = CompoundDescriptorCalculator(simpleSpecs)
  calc.BuildAtomDict()

  atomNames = list(calc.atomDict)
  print('len:', len(atomNames))
  # show a few of the last entries as a sanity check
  for atomName in atomNames[-4:-1]:
    print(atomName, calc.atomDict[atomName])

  print('descriptors:', calc.GetDescriptorNames())
  for formula in ('Nb', 'Nb3', 'NbPt', 'Nb2Pt'):
    values = calc.CalcSimpleDescriptorsForComposition(formula)
    print(formula, values)


if __name__ == '__main__':  # pragma: nocover
  _exampleCode()