Package rdkit :: Package Chem :: Package Fingerprints :: Module FingerprintMols
[hide private]
[frames] | [no frames]

Source Code for Module rdkit.Chem.Fingerprints.FingerprintMols

  1  # 
  2  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  3  # 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  """ utility functionality for fingerprinting sets of molecules 
 11   includes a command line app for working with fingerprints 
 12   and databases 
 13   
 14   
 15  Sample Usage: 
 16   
 17    python FingerprintMols.py  -d data.gdb \ 
 18          -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID"  \ 
 19          --outTable="daylight_sig" 
 20   
 21   
 22  """ 
 23   
 24  from __future__ import print_function 
 25   
 26  import getopt 
 27  import sys 
 28   
 29  from rdkit import Chem 
 30  from rdkit import DataStructs 
 31  from rdkit.Chem import MACCSkeys 
 32  from rdkit.ML.Cluster import Murtagh 
 33  from rdkit.six.moves import cPickle 
 34   
 35   
def error(msg):
    """ writes an error message to stderr

    **Arguments**:

      - msg: the text to emit; it is written exactly as given
        (no newline is appended)

    """
    stream = sys.stderr
    stream.write(msg)
38 39
def message(msg):
    """ writes a progress/status message to stderr

    **Arguments**:

      - msg: the text to emit; it is written exactly as given
        (no newline is appended)

    """
    stream = sys.stderr
    stream.write(msg)
42 43
def GetRDKFingerprint(mol):
    """ uses default parameters

    A fresh FingerprinterDetails instance supplies the defaults; its
    attributes are forwarded to FingerprintMol() as keyword arguments.

    **Arguments**:

      - mol: the molecule to be fingerprinted

    **Returns**: whatever FingerprintMol() returns for _mol_

    """
    details = FingerprinterDetails()
    # keyword expansion replaces the apply() builtin, which was removed in
    # Python 3
    return FingerprintMol(mol, **details.__dict__)
48 49
def FoldFingerprintToTargetDensity(fp, **fpArgs):
    """ folds a fingerprint in half until it is dense enough

    The fingerprint is folded by a factor of 2 (DataStructs.FoldFingerprint)
    for as long as its fraction of on bits is below fpArgs['tgtDensity'] and
    half of its current size is still larger than fpArgs['minSize'].

    **Arguments**:

      - fp: the fingerprint; must provide GetNumOnBits() and GetNumBits()

      - fpArgs: must contain 'tgtDensity'; 'minSize' is only consulted
        when the density is below the target

    **Returns**: the (possibly folded) fingerprint

    """
    while True:
        onBits = fp.GetNumOnBits()
        totalBits = fp.GetNumBits()
        # dense enough already: nothing (more) to do
        if not (float(onBits) / totalBits < fpArgs['tgtDensity']):
            return fp
        # another fold would not leave more than minSize bits
        if not (totalBits / 2 > fpArgs['minSize']):
            return fp
        fp = DataStructs.FoldFingerprint(fp, 2)
61 62
def FingerprintMol(mol, fingerprinter=Chem.RDKFingerprint, **fpArgs):
    """ generates a fingerprint for a single molecule

    **Arguments**:

      - mol: the molecule to be fingerprinted

      - fingerprinter: the function used to generate the fingerprint

      - fpArgs: fingerprinting parameters; when none are provided the
        attributes of a default FingerprinterDetails instance are used

    **Notes**:

      - Chem.RDKFingerprint is called with positional arguments taken from
        _fpArgs_ (minPath, maxPath, fpSize, bitsPerHash, useHs, tgtDensity,
        minSize).

      - any other fingerprinter is called with _fpArgs_ as keyword
        arguments and its result is passed through
        FoldFingerprintToTargetDensity().

    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter != Chem.RDKFingerprint:
        rawFp = fingerprinter(mol, **fpArgs)
        return FoldFingerprintToTargetDensity(rawFp, **fpArgs)

    positionalKeys = ('minPath', 'maxPath', 'fpSize', 'bitsPerHash', 'useHs', 'tgtDensity',
                      'minSize')
    return fingerprinter(mol, *[fpArgs[key] for key in positionalKeys])
76 77
def FingerprintsFromSmiles(dataSource, idCol, smiCol, fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10, maxMols=-1, **fpArgs):
    """ fingerprints the molecules described by the SMILES in _dataSource_

    **Arguments**:

      - dataSource: an iterable of indexable rows

      - idCol, smiCol: indices of the ID and SMILES entries in each row

      - fingerprinter: the fingerprinting function, handed to FingerprintMol()

      - reportFreq: if > 0, a progress message is written every
        _reportFreq_ fingerprinted molecules

      - maxMols: if > 0, stop once this many molecules were fingerprinted

      - fpArgs: passed as keyword arguments to the fingerprinter

    Returns a list of 2-tuples: (ID,fp)

    Rows whose SMILES cannot be parsed are reported via error() and skipped.

    """
    results = []
    nDone = 0
    for row in dataSource:
        molId = str(row[idCol])
        smiles = str(row[smiCol])
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            error('Problems parsing SMILES: %s\n' % smiles)
            continue
        results.append((molId, FingerprintMol(mol, fingerprinter, **fpArgs)))
        nDone += 1
        if reportFreq > 0 and nDone % reportFreq == 0:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return results
101 102
def FingerprintsFromMols(mols, fingerprinter=Chem.RDKFingerprint, reportFreq=10, maxMols=-1,
                         **fpArgs):
    """ fingerprints a sequence of (ID, molecule) pairs

    **Arguments**:

      - mols: an iterable of 2-tuples: (ID, mol)

      - fingerprinter: the fingerprinting function, handed to FingerprintMol()

      - reportFreq: if > 0, a progress message is written every
        _reportFreq_ fingerprinted molecules

      - maxMols: if > 0, stop once this many molecules were fingerprinted

      - fpArgs: passed as keyword arguments to the fingerprinter

    Returns a list of 2-tuples: (ID,fp)

    Entries whose molecule is missing (falsy) are reported via error() and
    skipped.

    """
    res = []
    nDone = 0
    for ID, mol in mols:
        if not mol:
            # there is no SMILES in this function; the previous message
            # referenced an undefined name and raised NameError instead of
            # reporting the problem
            error('Problems with molecule for ID: %s\n' % (ID, ))
            continue
        res.append((ID, FingerprintMol(mol, fingerprinter, **fpArgs)))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
124 125
def FingerprintsFromPickles(dataSource, idCol, pklCol, fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10, maxMols=-1, **fpArgs):
    """ fingerprints the molecules stored as pickles in _dataSource_

    **Arguments**:

      - dataSource: an iterable of indexable rows

      - idCol, pklCol: indices of the ID and molecule-pickle entries in
        each row

      - fingerprinter: the fingerprinting function, handed to FingerprintMol()

      - reportFreq: if > 0, a progress message is written every
        _reportFreq_ fingerprinted molecules

      - maxMols: if > 0, stop once this many molecules were fingerprinted

      - fpArgs: passed as keyword arguments to the fingerprinter

    Returns a list of 2-tuples: (ID,fp)

    """
    results = []
    nDone = 0
    for row in dataSource:
        molId = str(row[idCol])
        pickled = str(row[pklCol])
        mol = Chem.Mol(pickled)
        if mol is None:
            error('Problems parsing pickle for ID: %s\n' % molId)
            continue
        results.append((molId, FingerprintMol(mol, fingerprinter, **fpArgs)))
        nDone += 1
        if reportFreq > 0 and nDone % reportFreq == 0:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return results
149 150
def FingerprintsFromDetails(details, reportFreq=10):
    """ generates fingerprints for the molecules described by _details_
    and stores them wherever _details_ asks for

    **Arguments**:

      - details: a FingerprinterDetails-like object.  The input is taken
        from (first match wins):

          1. the database table details.dbName/details.tableName

          2. the text file details.inFileName (if details.useSmiles)

          3. the SD file details.inFileName (if details.useSD)

        details.idName is filled in when it is empty.

      - reportFreq: if > 0, a message is written every _reportFreq_
        molecules while reading an SD file

    **Returns**: a list of 2-tuples: (ID,fp), or None if no input data
      could be loaded

    **Side effects**:

      - if details.outFileName is set, each (ID,fp) tuple is pickled to
        that file

      - if details.outTableName and a database name (details.outDbName or
        details.dbName) are set, the fingerprints are inserted into that
        table

    """
    data = None
    # make sure these are always bound, even when reading the input fails
    dataSet = None
    idCol = 0
    smiCol = 1
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            conn = DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                             details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName, details.smilesName))
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            # plain iteration works on both Python 2 and 3 (no .next() call)
            for mol in supplier:
                if not mol:
                    continue
                dataSet.append(mol)
                if reportFreq > 0 and not len(dataSet) % reportFreq:
                    message('Read %d molecules\n' % (len(dataSet)))
                # the limit is honoured independently of the report frequency
                if details.maxMols > 0 and len(dataSet) >= details.maxMols:
                    break

            # label each molecule with its ID property, falling back to the
            # molecule name
            for i, mol in enumerate(dataSet):
                if mol.HasProp(details.idName):
                    nm = mol.GetProp(details.idName)
                else:
                    nm = mol.GetProp('_Name')
                dataSet[i] = (nm, mol)

    fps = None
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol, **details.__dict__)
        else:
            fps = FingerprintsFromPickles(data, idCol, smiCol, **details.__dict__)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **details.__dict__)

    if fps:
        if details.outFileName:
            # one pickle per (ID,fp) tuple, written back to back
            with open(details.outFileName, 'wb+') as outF:
                for entry in fps:
                    cPickle.dump(entry, outF)
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #    the types of our columns...
            #
            # NOTE(review): _data_ is only populated for database/text-file
            #   input; with SD input it is still None here, so this call
            #   presumably fails -- confirm and handle.
            colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
            typeStrs = DbUtils.GetTypeStrings([details.idName, details.smilesName], colTypes,
                                              keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            existingTables = [x.upper() for x in conn.GetTableNames()]
            if details.replaceTable or details.outTableName.upper() not in existingTables:
                conn.AddTable(details.outTableName, cols)

            #
            # And add the data
            #
            for ID, fp in fps:
                tpl = ID, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps
268 # ------------------------------------------------ 269 # 270 # Command line parsing stuff 271 # 272 # ------------------------------------------------ 273 274
class FingerprinterDetails(object):
    """ container for the settings of a fingerprinting run;
    every setting receives a default value on construction

    """

    def __init__(self):
        for initialize in (self._fingerprinterInit, self._screenerInit, self._clusterInit):
            initialize()

    def _fingerprinterInit(self):
        """ sets the defaults of the fingerprinting-related settings """
        self.fingerprinter = Chem.RDKFingerprint
        self.fpColName = "AutoFragmentFP"
        self.idName = ''
        self.dbName = ''
        self.outDbName = ''
        self.tableName = ''
        # fingerprint size/density
        self.minSize = 64
        self.fpSize = 2048
        self.tgtDensity = 0.3
        # path/hash parameters
        self.minPath = 1
        self.maxPath = 7
        self.discrimHash = 0
        self.useHs = 0
        self.useValence = 0
        self.bitsPerHash = 2
        # input/output
        self.smilesName = 'SMILES'
        self.maxMols = -1
        self.outFileName = ''
        self.outTableName = ''
        self.inFileName = ''
        self.replaceTable = True
        self.molPklName = ''
        self.useSmiles = True
        self.useSD = False

    def _screenerInit(self):
        """ sets the defaults of the screening-related settings """
        self.metric = DataStructs.TanimotoSimilarity
        self.doScreen = ''
        self.topN = 10
        self.screenThresh = 0.75
        self.doThreshold = 0
        self.smilesTableName = ''
        self.probeSmiles = ''
        self.probeMol = None
        self.noPickle = 0

    def _clusterInit(self):
        """ sets the defaults of the clustering-related settings """
        self.clusterAlgo = Murtagh.WARDS
        self.actTableName = ''
        self.actName = ''

    def GetMetricName(self):
        """ returns 'Tanimoto', 'Dice' or 'Cosine' when the metric is the
        corresponding DataStructs similarity function; otherwise the metric
        itself if it is truthy, else 'Unknown'

        """
        knownMetrics = (
            ('TanimotoSimilarity', 'Tanimoto'),
            ('DiceSimilarity', 'Dice'),
            ('CosineSimilarity', 'Cosine'),
        )
        for funcName, label in knownMetrics:
            if self.metric == getattr(DataStructs, funcName):
                return label
        return self.metric or 'Unknown'

    def SetMetricFromName(self, name):
        """ sets the metric from its (case-insensitive) name: Tanimoto,
        Dice or Cosine; any other name leaves the metric untouched

        """
        funcName = {
            "TANIMOTO": 'TanimotoSimilarity',
            "DICE": 'DiceSimilarity',
            "COSINE": 'CosineSimilarity',
        }.get(name.upper())
        if funcName is not None:
            self.metric = getattr(DataStructs, funcName)
348 349
def Usage():
    """ prints the module usage message (_usageDoc) and terminates the
    interpreter with exit status -1

    """
    exitStatus = -1
    print(_usageDoc)
    sys.exit(exitStatus)
356 357 358 _usageDoc = """ 359 Usage: FingerprintMols.py [args] <fName> 360 361 If <fName> is provided and no tableName is specified (see below), 362 data will be read from the text file <fName>. Text files delimited 363 with either commas (extension .csv) or tabs (extension .txt) are 364 supported. 365 366 Command line arguments are: 367 - -d _dbName_: set the name of the database from which 368 to pull input molecule information. If output is 369 going to a database, this will also be used for that 370 unless the --outDbName option is used. 371 372 - -t _tableName_: set the name of the database table 373 from which to pull input molecule information 374 375 - --smilesName=val: sets the name of the SMILES column 376 in the input database. Default is *SMILES*. 377 378 - --useSD: Assume that the input file is an SD file, not a SMILES 379 table. 380 381 - --idName=val: sets the name of the id column in the input 382 database. Defaults to be the name of the first db column 383 (or *ID* for text files). 384 385 - -o _outFileName_: name of the output file (output will 386 be a pickle file with one label,fingerprint entry for each 387 molecule). 388 389 - --outTable=val: name of the output db table used to store 390 fingerprints. If this table already exists, it will be 391 replaced. 392 393 - --outDbName: name of output database, if it's being used. 394 Defaults to be the same as the input db. 395 396 - --fpColName=val: name to use for the column which stores 397 fingerprints (in pickled format) in the output db table. 398 Default is *AutoFragmentFP* 399 400 - --maxSize=val: base size of the fingerprints to be generated 401 Default is *2048* 402 403 - --minSize=val: minimum size of the fingerprints to be generated 404 (limits the amount of folding that happens). Default is *64* 405 406 - --density=val: target bit density in the fingerprint. The 407 fingerprint will be folded until this density is 408 reached. 
Default is *0.3* 409 410 - --minPath=val: minimum path length to be included in 411 fragment-based fingerprints. Default is *1*. 412 413 - --maxPath=val: maximum path length to be included in 414 fragment-based fingerprints. Default is *7*. 415 416 - --nBitsPerHash: number of bits to be set in the output 417 fingerprint for each fragment. Default is *2*. 418 419 - --discrim: use of path-based discriminators to hash bits. 420 Default is *false*. 421 422 - -V: include valence information in the fingerprints 423 Default is *false*. 424 425 - -H: include Hs in the fingerprint 426 Default is *false*. 427 428 - --maxMols=val: sets the maximum number of molecules to be 429 fingerprinted. 430 431 - --useMACCS: use the public MACCS keys to do the fingerprinting 432 (instead of a daylight-type fingerprint) 433 434 """ 435 436
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
    _FingerprinterDetails_ instance with the results.

    **Arguments**:

      - details: (optional) the object to be updated; if not provided a
        new _FingerprinterDetails_ is created

    **Returns**: the updated details object.  The first non-option
      argument, if any, is stored as its inFileName.

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

      - Invalid options, and -h, end up in Usage(), which exits.

    """
    longOpts = [
        'minSize=',
        'maxSize=',
        'density=',
        'minPath=',
        'maxPath=',
        # both spellings are accepted: the usage message documents
        # --nBitsPerHash, while --bitsPerHash is what used to be declared here
        'bitsPerHash=',
        'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'useSD',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # SCREENING:
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # CLUSTERING:
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]
    try:
        args, extras = getopt.getopt(sys.argv[1:], 'HVs:d:t:o:h', longOpts)
    except getopt.GetoptError:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if extras:
        details.inFileName = extras[0]

    # options carrying a value: option -> (attribute, converter)
    valueOpts = {
        '-d': ('dbName', str),
        '-t': ('tableName', str),
        '-o': ('outFileName', str),
        '--minSize': ('minSize', int),
        '--maxSize': ('fpSize', int),
        '--density': ('tgtDensity', float),
        '--outTable': ('outTableName', str),
        '--outDbName': ('outDbName', str),
        '--fpColName': ('fpColName', str),
        '--minPath': ('minPath', int),
        '--maxPath': ('maxPath', int),
        '--nBitsPerHash': ('bitsPerHash', int),
        '--bitsPerHash': ('bitsPerHash', int),
        '--smilesName': ('smilesName', str),
        '--molPkl': ('molPklName', str),
        '--idName': ('idName', str),
        '--maxMols': ('maxMols', int),
        # SCREENER:
        '--smilesTable': ('smilesTableName', str),
        '--smiles': ('probeSmiles', str),
        # CLUSTERS:
        '--actTable': ('actTableName', str),
        '--actName': ('actName', str),
    }
    # simple switches: option -> (attribute, value to store)
    flagOpts = {
        '-H': ('useHs', 1),
        '-V': ('useValence', 1),
        '--discrim': ('discrimHash', 1),
        '--keepTable': ('replaceTable', False),
    }

    # NOTE(review): -s and --doScreen are accepted by getopt but nothing
    #   below acts on them -- confirm whether that is intended.
    for arg, val in args:
        if arg in valueOpts:
            attrName, convert = valueOpts[arg]
            setattr(details, attrName, convert(val))
        elif arg in flagOpts:
            attrName, flagValue = flagOpts[arg]
            setattr(details, attrName, flagValue)
        elif arg == '--useSD':
            details.useSmiles = False
            details.useSD = True
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys

        # SCREENER:
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity

        # CLUSTERS:
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '-h':
            Usage()
    return details
if __name__ == '__main__':
    # command line entry point: announce the tool on stderr, read the
    # settings from sys.argv and run the fingerprinting they describe
    message("This is FingerprintMols\n\n")
    details = ParseArgs()
    FingerprintsFromDetails(details)