Package rdkit :: Package ML :: Package Descriptors :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Descriptors.Parser

  1  # 
  2  #  Copyright (C) 2001-2004  greg Landrum and Rational Discovery LLC 
  3  #  All Rights Reserved 
  4  # 
  5  """ The "parser" for compound descriptors. 
  6   
  7  I almost hesitate to document this, because it's not the prettiest 
  8  thing the world has ever seen... but it does work (for at least some 
  9  definitions of the word). 
 10   
 11  Rather than getting into the whole mess of writing a parser for the 
 12  compound descriptor expressions, I'm just using string substitutions 
 13  and python's wonderful ability to *eval* code. 
 14   
 15  It would probably be a good idea at some point to replace this with a 
 16  real parser, if only for the flexibility and intelligent error 
 17  messages that would become possible. 
 18   
 19  The general idea is that we're going to deal with expressions where 
 20  atomic descriptors have some kind of method applied to them which 
 21  reduces them to a single number for the entire composition.  Compound 
 22  descriptors (those applicable to the compound as a whole) are not 
 23  operated on by anything in particular (except for standard math stuff). 
 24   
 25  Here's the general flow of things: 
 26   
 27    1) Composition descriptor references ($a, $b, etc.) are replaced with the 
 28       corresponding descriptor names using string substitution. 
 29       (*_SubForCompoundDescriptors*) 
 30   
 31    2) Atomic descriptor references ($1, $2, etc) are replaced with lookups 
 32       into the atomic dict with "DEADBEEF" in place of the atom name. 
 33       (*_SubForAtomicVars*) 
 34   
 35    3) Calls to Calculator Functions are augmented with a reference to 
 36       the composition and atomic dictionary 
 37       (*_SubMethodArgs*) 
 38   
 39  **NOTE:** 
 40   
 41    anytime we don't know the answer for a descriptor, rather than 
 42    throwing a (completely incomprehensible) exception, we just return 
 43    -666.  So bad descriptor values should stand out like sore thumbs. 
 44   
 45  """ 
 46   
 47  from __future__ import print_function 
 48   
 49  # The wildcard import is required to make functions available for the eval statement 
 50  from math import * 
 51   
 52  from rdkit import RDConfig 
 53   
# When true, CalcSingleCompoundDescriptor prints diagnostics and raises
# RuntimeError on failure instead of quietly returning -666.
__DEBUG = False

# NOTE: the wildcard import of the math module (see the imports above) is what
# lets eval'd descriptor expressions use sqrt(), exp(), etc. directly.

# ----------------------
# atomic descriptor section
# ----------------------
# these are the methods which can be applied to ATOMIC descriptors.
# The Calc* functions pass this list to _SubMethodArgs so that calls to these
# methods get the extra (compos, atomDict) arguments filled in.
knownMethods = ['SUM', 'MIN', 'MAX', 'MEAN', 'AVG', 'DEV', 'HAS']
 63   
 64   
def HAS(strArg, composList, atomDict):
    """ *Calculator Method*

    does a containment ("in") search across the atoms of a composition

    **Arguments**

      - strArg: the arguments in string form: "<where expression>,<what expression>"

      - composList: the composition vector, a sequence of (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      1 if the value of the "what" expression is found in the "where"
      value of any atom, 0 if it is found in none of them, and -666 if
      fewer than two comma-separated arguments were supplied.

    """
    pieces = strArg.split(',')
    if len(pieces) <= 1:
        # not enough arguments to do a search: flag the value as bad
        return -666

    whereExpr = pieces[0]
    whatExpr = pieces[1]
    for element, _ in composList:
        # eval() resolves names against this function's locals, which is how
        # the "atomDict" references inside the expression get found.
        haystack = eval(whereExpr.replace('DEADBEEF', element))
        needle = eval(whatExpr)
        if needle in haystack:
            return 1
    return 0
94 95
def SUM(strArg, composList, atomDict):
    """ *Calculator Method*

    calculates the amount-weighted sum of a descriptor across a composition

    **Arguments**

      - strArg: the descriptor expression in string form; every occurrence
        of "DEADBEEF" is replaced with the atom name before evaluation

      - composList: the composition vector, a sequence of (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      a float (0.0 for an empty composition)

    """
    total = 0.0
    for element, amount in composList:
        # explicit loop on purpose: eval() must see atomDict in the local scope
        total += eval(strArg.replace('DEADBEEF', element)) * amount
    return total
119 120
def MEAN(strArg, composList, atomDict):
    """ *Calculator Method*

    calculates the amount-weighted average of a descriptor across a composition

    **Arguments**

      - strArg: the descriptor expression in string form; every occurrence
        of "DEADBEEF" is replaced with the atom name before evaluation

      - composList: the composition vector, a sequence of (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      a float

    **Notes**

      - an empty composition (or amounts summing to zero) results in a
        ZeroDivisionError from the final division

    """
    weightedTotal = 0.0
    totalAmount = 0
    for element, amount in composList:
        # explicit loop on purpose: eval() must see atomDict in the local scope
        weightedTotal += eval(strArg.replace('DEADBEEF', element)) * amount
        totalAmount += amount
    return weightedTotal / totalAmount


# AVG is simply another name for MEAN
AVG = MEAN


def DEV(strArg, composList, atomDict):
    """ *Calculator Method*

    calculates the amount-weighted average absolute deviation (from the
    MEAN) of a descriptor across a composition

    **Arguments**

      - strArg: the descriptor expression in string form; every occurrence
        of "DEADBEEF" is replaced with the atom name before evaluation

      - composList: the composition vector, a sequence of (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      a float

    """
    center = MEAN(strArg, composList, atomDict)
    spread = 0.0
    totalAmount = 0.0
    for element, amount in composList:
        # explicit loop on purpose: eval() must see atomDict in the local scope
        value = eval(strArg.replace('DEADBEEF', element))
        spread += abs(value - center) * amount
        totalAmount += amount
    return spread / totalAmount
177 178
def MIN(strArg, composList, atomDict):
    """ *Calculator Method*

    calculates the minimum value of a descriptor across a composition
    (the amounts in the composition are ignored)

    **Arguments**

      - strArg: the descriptor expression in string form; every occurrence
        of "DEADBEEF" is replaced with the atom name before evaluation

      - composList: the composition vector, a sequence of (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      the smallest per-atom value (normally a float)

    """
    perAtomValues = []
    # An explicit loop (not a comprehension/generator) is required here:
    # eval() has to find atomDict among this function's locals.
    for element, _ in composList:
        expression = strArg.replace('DEADBEEF', element)
        perAtomValues.append(eval(expression))
    return min(perAtomValues)
202 203
def MAX(strArg, composList, atomDict):
    """ *Calculator Method*

    calculates the maximum value of a descriptor across a composition
    (the amounts in the composition are ignored)

    **Arguments**

      - strArg: the descriptor expression in string form; every occurrence
        of "DEADBEEF" is replaced with the atom name before evaluation

      - composList: the composition vector, a sequence of (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      the largest per-atom value (normally a float)

    """
    perAtomValues = []
    # An explicit loop (not a comprehension/generator) is required here:
    # eval() has to find atomDict among this function's locals.
    for element, _ in composList:
        expression = strArg.replace('DEADBEEF', element)
        perAtomValues.append(eval(expression))
    return max(perAtomValues)
227 228 # ------------------ 229 # string replacement routines 230 # these are not intended to be called by clients 231 # ------------------ 232 233
234 -def _SubForAtomicVars(cExpr, varList, dictName):
235 """ replace atomic variables with the appropriate dictionary lookup 236 237 *Not intended for client use* 238 239 """ 240 for i in range(len(varList)): 241 cExpr = cExpr.replace('$%d' % (i + 1), '%s["DEADBEEF"]["%s"]' % (dictName, varList[i])) 242 return cExpr
243 244
245 -def _SubForCompoundDescriptors(cExpr, varList, dictName):
246 """ replace compound variables with the appropriate list index 247 248 *Not intended for client use* 249 250 """ 251 for i in range(len(varList)): 252 cExpr = cExpr.replace('$%s' % chr(ord('a') + i), '%s["%s"]' % (dictName, varList[i])) 253 return cExpr
254 255
256 -def _SubMethodArgs(cExpr, knownMethods):
257 """ alters the arguments of calls to calculator methods 258 259 *Not intended for client use* 260 261 This is kind of putrid (and the code ain't so pretty either) 262 The general idea is that the various special methods for atomic 263 descriptors need two extra arguments (the composition and the atomic 264 dict). Rather than make the user type those in, we just find 265 invocations of these methods and fill out the function calls using 266 string replacements. 267 """ 268 res = cExpr 269 for method in knownMethods: 270 p = 0 271 while p != -1 and p < len(res): 272 p = res.find(method, p) 273 if p != -1: 274 p = p + len(method) + 1 275 start = p 276 parenCount = 1 277 while parenCount and p < len(res): 278 if res[p] == ')': 279 parenCount = parenCount - 1 280 elif res[p] == '(': 281 parenCount = parenCount + 1 282 p = p + 1 283 if p <= len(res): 284 res = res[0:start] + "'%s',compos,atomDict" % (res[start:p - 1]) + res[p - 1:] 285 return res
286 287
def CalcSingleCompoundDescriptor(compos, argVect, atomDict, propDict):
    """ calculates the value of the descriptor for a single compound

    **ARGUMENTS:**

      - compos: a vector/tuple containing the composition
        information... in the form:
        '[("Fe",1.),("Pt",2.),("Rh",0.02)]'

      - argVect: a vector/tuple with three elements:

           1) AtomicDescriptorNames:  a list/tuple of the names of the
             atomic descriptors being used. These determine the
             meaning of $1, $2, etc. in the expression

           2) CompoundDescriptorNames:  a list/tuple of the names of the
             compound descriptors being used. These determine the
             meaning of $a, $b, etc. in the expression

           3) Expr: a string containing the expression to be used to
             evaluate the final result.

      - atomDict:
        a dictionary of atomic descriptors.  Each atomic entry is
        another dictionary containing the individual descriptors
        and their values

      - propDict:
        a dictionary of descriptors for the composition.

    **RETURNS:**

      the value of the descriptor, -666 if a problem was encountered

    **RAISES:**

      RuntimeError, but only when the module-level __DEBUG flag is set;
      otherwise every failure is reported through the -666 return value.

    **NOTE:**

      - because it takes rather a lot of work to get everything set
        up to calculate a descriptor, if you are calculating the
        same descriptor for multiple compounds, you probably want to
        be calling CalcMultipleCompoundsDescriptor().

      - the expression is executed with eval(), so it must come from a
        trusted source.

    """
    try:
        atomVarNames = argVect[0]
        compositionVarNames = argVect[1]
        formula = argVect[2]
        formula = _SubForCompoundDescriptors(formula, compositionVarNames, 'propDict')
        formula = _SubForAtomicVars(formula, atomVarNames, 'atomDict')
        evalTarget = _SubMethodArgs(formula, knownMethods)
    except Exception:
        if __DEBUG:
            import traceback
            print('Sub Failure!')
            traceback.print_exc()
            # evalTarget is assigned by the last statement of the try block,
            # so it is never bound here; printing it would raise
            # UnboundLocalError and hide the real problem.  Show the raw
            # input instead.
            print(argVect)
            print(propDict)
            raise RuntimeError('Failure 1')
        else:
            return -666

    try:
        # NOTE: the generated code refers to the parameters "compos",
        # "atomDict" and "propDict" by name, as well as to the module-level
        # calculator methods and math functions; eval() picks them up from
        # this function's locals and the module globals.
        v = eval(evalTarget)
    except Exception:
        if __DEBUG:
            import traceback
            # NOTE(review): the path uses lower-case 'ml/descriptors';
            # confirm this matches the on-disk layout on case-sensitive
            # filesystems.
            # the context manager closes the log even if a write fails
            with open(RDConfig.RDCodeDir + '/ml/descriptors/log.txt', 'a+') as outF:
                outF.write('#------------------------------\n')
                outF.write('formula: %s\n' % repr(formula))
                outF.write('target: %s\n' % repr(evalTarget))
                outF.write('propDict: %s\n' % (repr(propDict)))
                outF.write('keys: %s\n' % (repr(sorted(atomDict))))
            print('ick!')
            print('formula:', formula)
            print('target:', evalTarget)
            print('propDict:', propDict)
            print('keys:', atomDict.keys())
            traceback.print_exc()
            raise RuntimeError('Failure 2')
        else:
            v = -666
    return v
371 372
def CalcMultipleCompoundsDescriptor(composVect, argVect, atomDict, propDictList):
    """ calculates the value of the descriptor for a list of compounds

    The expression is translated into evaluable code once and then
    evaluated for each compound in turn.

    **ARGUMENTS:**

      - composVect: a vector of compositions, one per compound.
        See CalcSingleCompoundDescriptor() for the form of each element.

      - argVect: a vector/tuple with three elements:

           1) AtomicDescriptorNames:  a list/tuple of the names of the
             atomic descriptors being used. These determine the
             meaning of $1, $2, etc. in the expression

           2) CompoundDescriptorNames:  a list/tuple of the names of the
             compound descriptors being used. These determine the
             meaning of $a, $b, etc. in the expression

           3) Expr: a string containing the expression to be used to
             evaluate the final result.

      - atomDict:
        a dictionary of atomic descriptors.  Each atomic entry is
        another dictionary containing the individual descriptors
        and their values

      - propDictList:
        a vector of compound-descriptor dictionaries, parallel to composVect.

    **RETURNS:**

      a vector containing the values of the descriptor for each
      compound.  Any given entry will be -666 if problems were
      encountered; if the expression itself cannot be translated, every
      entry is -666.

    """
    res = [-666] * len(composVect)
    try:
        atomVarNames, compositionVarNames, rawExpr = argVect[0], argVect[1], argVect[2]
        withCompound = _SubForCompoundDescriptors(rawExpr, compositionVarNames, 'propDict')
        withAtomic = _SubForAtomicVars(withCompound, atomVarNames, 'atomDict')
        evalTarget = _SubMethodArgs(withAtomic, knownMethods)
    except Exception:
        return res

    # The generated code refers to "compos", "propDict" and "atomDict" by
    # name, so those exact local names must exist when eval() runs.
    for idx, compos in enumerate(composVect):
        propDict = propDictList[idx]
        try:
            res[idx] = eval(evalTarget)
        except Exception:
            res[idx] = -666
    return res
429 430 431 # ------------ 432 # Demo/testing code 433 # ------------
def _exampleCode():  # pragma: nocover
    """ demo: evaluates a handful of sample expressions for a small
    Fe/Pt composition and prints, for each one, the expression, the
    single-compound result and the multiple-compound result

    """
    descriptorNames = ['d1', 'd2', 's1']
    # the same names serve as both the atomic and the compound descriptor names
    piece1 = [descriptorNames, list(descriptorNames)]
    aDict = {
        'Fe': {'d1': 1., 'd2': 2., 's1': 'abc'},
        'Pt': {'d1': 10., 'd2': 20., 's1': 'def'},
    }
    pDict = {'d1': 100., 'd2': 200.}
    composition = [('Fe', 1), ('Pt', 1)]

    cExprs = [
        "SUM($1)",
        "SUM($1)+SUM($2)",
        "SUM($1)+SUM($1)",
        "MEAN($1)",
        "DEV($2)",
        "MAX($1)",
        "MIN($1)/MAX($1)",
        "MIN($2)",
        "SUM($1)/$a",
        "sqrt($a+$b)",
        "SUM((3.*$1)/($2))",
        'HAS($3,"def")',
        'HAS($3,"xyz")',
        "foo",  # deliberately invalid: demonstrates the -666 error value
    ]

    for cExpr in cExprs:
        argVect = piece1 + [cExpr]
        print(cExpr)
        print(CalcSingleCompoundDescriptor(composition, argVect, aDict, pDict))
        print(CalcMultipleCompoundsDescriptor([composition, composition], argVect, aDict,
                                              [pDict, pDict]))


if __name__ == '__main__':  # pragma: nocover
    _exampleCode()