Package rdkit ::
Package Chem ::
Module BuildFragmentCatalog
|
|
1
2
3
4
5
6
7
8
9
10 """ command line utility for working with FragmentCatalogs (CASE-type analysis)
11
12 **Usage**
13
14 BuildFragmentCatalog [optional args] <filename>
15
16 filename, the name of a delimited text file containing InData, is required
17 for some modes of operation (see below)
18
19 **Command Line Arguments**
20
21 - -n *maxNumMols*: specify the maximum number of molecules to be processed
22
23 - -b: build the catalog and OnBitLists
24 *requires InData*
25
26 - -s: score compounds
27 *requires InData and a Catalog, can use OnBitLists*
28
29 - -g: calculate info gains
30 *requires Scores*
31
32 - -d: show details about high-ranking fragments
33 *requires a Catalog and Gains*
34
35 - --catalog=*filename*: filename with the pickled catalog.
36 If -b is provided, this file will be overwritten.
37
38 - --onbits=*filename*: filename to hold the pickled OnBitLists.
39 If -b is provided, this file will be overwritten
40
41 - --scores=*filename*: filename to hold the text score data.
42 If -s is provided, this file will be overwritten
43
44 - --gains=*filename*: filename to hold the text gains data.
45 If -g is provided, this file will be overwritten
46
47 - --details=*filename*: filename to hold the text details data.
48 If -d is provided, this file will be overwritten.
49
50 - --minPath=2: specify the minimum length for a path
51
52 - --maxPath=6: specify the maximum length for a path
53
54 - --smiCol=1: specify which column in the input data file contains
55 SMILES
56
57 - --actCol=-1: specify which column in the input data file contains
58 activities
59
60 - --nActs=2: specify the number of possible activity values
61
62 - --nBits=-1: specify the maximum number of bits to show details for
63
64 """
65 from __future__ import print_function
66
67 import os
68 import sys
69
70 import numpy
71
72 from rdkit import RDConfig
73 from rdkit.Chem import FragmentCatalog
74 from rdkit.Dbase.DbConnection import DbConnect
75 from rdkit.ML import InfoTheory
76 from rdkit.six import next
77 from rdkit.six.moves import cPickle
78
79
82
83
84 -def BuildCatalog(suppl, maxPts=-1, groupFileName=None, minPath=2, maxPath=6, reportFreq=10):
85 """ builds a fragment catalog from a set of molecules in a delimited text block
86
87 **Arguments**
88
89 - suppl: a mol supplier
90
91 - maxPts: (optional) if provided, this will set an upper bound on the
92 number of points to be considered
93
94 - groupFileName: (optional) name of the file containing functional group
95 information
96
97 - minPath, maxPath: (optional) names of the minimum and maximum path lengths
98 to be considered
99
100 - reportFreq: (optional) how often to display status information
101
102 **Returns**
103
104 a FragmentCatalog
105
106 """
107 if groupFileName is None:
108 groupFileName = os.path.join(RDConfig.RDDataDir, "FunctionalGroups.txt")
109
110 fpParams = FragmentCatalog.FragCatParams(minPath, maxPath, groupFileName)
111 catalog = FragmentCatalog.FragCatalog(fpParams)
112 fgen = FragmentCatalog.FragCatGenerator()
113 if maxPts > 0:
114 nPts = maxPts
115 else:
116 if hasattr(suppl, '__len__'):
117 nPts = len(suppl)
118 else:
119 nPts = -1
120 for i, mol in enumerate(suppl):
121 if i == nPts:
122 break
123 if i and not i % reportFreq:
124 if nPts > -1:
125 message('Done %d of %d, %d paths\n' % (i, nPts, catalog.GetFPLength()))
126 else:
127 message('Done %d, %d paths\n' % (i, catalog.GetFPLength()))
128 fgen.AddFragsFromMol(mol, catalog)
129 return catalog
130
131
132 -def ScoreMolecules(suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2, reportFreq=10):
133 """ scores the compounds in a supplier using a catalog
134
135 **Arguments**
136
137 - suppl: a mol supplier
138
139 - catalog: the FragmentCatalog
140
141 - maxPts: (optional) the maximum number of molecules to be
142 considered
143
144 - actName: (optional) the name of the molecule's activity property.
145 If this is not provided, the molecule's last property will be used.
146
147 - acts: (optional) a sequence of activity values (integers).
148 If not provided, the activities will be read from the molecules.
149
150 - nActs: (optional) number of possible activity values
151
152 - reportFreq: (optional) how often to display status information
153
154 **Returns**
155
156 a 2-tuple:
157
158 1) the results table (a 3D array of ints nBits x 2 x nActs)
159
160 2) a list containing the on bit lists for each molecule
161
162 """
163 nBits = catalog.GetFPLength()
164 resTbl = numpy.zeros((nBits, 2, nActs), numpy.int)
165 obls = []
166
167 if not actName and not acts:
168 actName = suppl[0].GetPropNames()[-1]
169
170 fpgen = FragmentCatalog.FragFPGenerator()
171 suppl.reset()
172 i = 1
173 for mol in suppl:
174 if i and not i % reportFreq:
175 message('Done %d.\n' % (i))
176 if mol:
177 if not acts:
178 act = int(mol.GetProp(actName))
179 else:
180 act = acts[i - 1]
181 fp = fpgen.GetFPForMol(mol, catalog)
182 obls.append([x for x in fp.GetOnBits()])
183 for j in range(nBits):
184 resTbl[j, 0, act] += 1
185 for id_ in obls[i - 1]:
186 resTbl[id_ - 1, 0, act] -= 1
187 resTbl[id_ - 1, 1, act] += 1
188 else:
189 obls.append([])
190 i += 1
191 return resTbl, obls
192
193
194 -def ScoreFromLists(bitLists, suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2,
195 reportFreq=10):
196 """ similar to _ScoreMolecules()_, but uses pre-calculated bit lists
197 for the molecules (this speeds things up a lot)
198
199
200 **Arguments**
201
202 - bitLists: sequence of on bit sequences for the input molecules
203
204 - suppl: the input supplier (we read activities from here)
205
206 - catalog: the FragmentCatalog
207
208 - maxPts: (optional) the maximum number of molecules to be
209 considered
210
211 - actName: (optional) the name of the molecule's activity property.
212 If this is not provided, the molecule's last property will be used.
213
214 - nActs: (optional) number of possible activity values
215
216 - reportFreq: (optional) how often to display status information
217
218 **Returns**
219
220 the results table (a 3D array of ints nBits x 2 x nActs)
221
222 """
223 nBits = catalog.GetFPLength()
224 if maxPts > 0:
225 nPts = maxPts
226 else:
227 nPts = len(bitLists)
228 resTbl = numpy.zeros((nBits, 2, nActs), numpy.int)
229 if not actName and not acts:
230 actName = suppl[0].GetPropNames()[-1]
231 suppl.reset()
232 for i in range(1, nPts + 1):
233 mol = next(suppl)
234 if not acts:
235 act = int(mol.GetProp(actName))
236 else:
237 act = acts[i - 1]
238 if i and not i % reportFreq:
239 message('Done %d of %d\n' % (i, nPts))
240 ids = set()
241 for id_ in bitLists[i - 1]:
242 ids.add(id_ - 1)
243 for j in range(nBits):
244 resTbl[j, 0, act] += 1
245 for id_ in ids:
246 resTbl[id_, 0, act] -= 1
247 resTbl[id_, 1, act] += 1
248 return resTbl
249
250
251 -def CalcGains(suppl, catalog, topN=-1, actName='', acts=None, nActs=2, reportFreq=10, biasList=None,
252 collectFps=0):
253 """ calculates info gains by constructing fingerprints
254 *DOC*
255
256 Returns a 2-tuple:
257 1) gains matrix
258 2) list of fingerprints
259
260 """
261 nBits = catalog.GetFPLength()
262 if topN < 0:
263 topN = nBits
264 if not actName and not acts:
265 actName = suppl[0].GetPropNames()[-1]
266
267 if hasattr(suppl, '__len__'):
268 nMols = len(suppl)
269 else:
270 nMols = -1
271 fpgen = FragmentCatalog.FragFPGenerator()
272
273 if biasList:
274 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY)
275 ranker.SetBiasList(biasList)
276 else:
277 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY)
278 i = 0
279 fps = []
280 for mol in suppl:
281 if not acts:
282 try:
283 act = int(mol.GetProp(actName))
284 except KeyError:
285 message('ERROR: Molecule has no property: %s\n' % (actName))
286 message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames())))
287 raise KeyError(actName)
288 else:
289 act = acts[i]
290 if i and not i % reportFreq:
291 if nMols > 0:
292 message('Done %d of %d.\n' % (i, nMols))
293 else:
294 message('Done %d.\n' % (i))
295 fp = fpgen.GetFPForMol(mol, catalog)
296 ranker.AccumulateVotes(fp, act)
297 i += 1
298 if collectFps:
299 fps.append(fp)
300 gains = ranker.GetTopN(topN)
301 return gains, fps
302
303
304 -def CalcGainsFromFps(suppl, fps, topN=-1, actName='', acts=None, nActs=2, reportFreq=10,
305 biasList=None):
306 """ calculates info gains from a set of fingerprints
307
308 *DOC*
309
310 """
311 nBits = len(fps[0])
312 if topN < 0:
313 topN = nBits
314 if not actName and not acts:
315 actName = suppl[0].GetPropNames()[-1]
316
317 if hasattr(suppl, '__len__'):
318 nMols = len(suppl)
319 else:
320 nMols = -1
321 if biasList:
322 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY)
323 ranker.SetBiasList(biasList)
324 else:
325 ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY)
326 for i, mol in enumerate(suppl):
327 if not acts:
328 try:
329 act = int(mol.GetProp(actName))
330 except KeyError:
331 message('ERROR: Molecule has no property: %s\n' % (actName))
332 message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames())))
333 raise KeyError(actName)
334 else:
335 act = acts[i]
336 if i and not i % reportFreq:
337 if nMols > 0:
338 message('Done %d of %d.\n' % (i, nMols))
339 else:
340 message('Done %d.\n' % (i))
341 fp = fps[i]
342 ranker.AccumulateVotes(fp, act)
343 gains = ranker.GetTopN(topN)
344 return gains
345
346
348 actHeaders = ['Act-%d' % (x) for x in range(nActs)]
349 if cat:
350 outF.write('id,Description,Gain,%s\n' % (','.join(actHeaders)))
351 else:
352 outF.write('id,Gain,%s\n' % (','.join(actHeaders)))
353 for entry in gains:
354 id_ = int(entry[0])
355 outL = [str(id_)]
356 if cat:
357 descr = cat.GetBitDescription(id_)
358 outL.append(descr)
359 outL.append('%.6f' % entry[1])
360 outL += ['%d' % x for x in entry[2:]]
361 outF.write(','.join(outL))
362 outF.write('\n')
363
364
366 """ reads a list of ids and info gains out of an input file
367
368 """
369 res = []
370 _ = inF.readline()
371 for line in inF:
372 splitL = line.strip().split(delim)
373 res.append((splitL[idCol], float(splitL[gainCol])))
374 return res
375
376
377 -def ShowDetails(catalog, gains, nToDo=-1, outF=sys.stdout, idCol=0, gainCol=1, outDelim=','):
378 """
379 gains should be a sequence of sequences. The idCol entry of each
380 sub-sequence should be a catalog ID. _ProcessGainsData()_ provides
381 suitable input.
382
383 """
384 if nToDo < 0:
385 nToDo = len(gains)
386 for i in range(nToDo):
387 id_ = int(gains[i][idCol])
388 gain = float(gains[i][gainCol])
389 descr = catalog.GetFragDescription(id_)
390 if descr:
391 outF.write('%s\n' % (outDelim.join((str(id_), descr, str(gain)))))
392
393
427
428
430 print("This is BuildFragmentCatalog")
431 print('usage error')
432
433 sys.exit(-1)
434
435
463
464
466 import getopt
467 try:
468 args, extras = getopt.getopt(sys.argv[1:], 'n:d:cst',
469 ['catalog=', 'onbits=', 'scoresFile=', 'gainsFile=',
470 'detailsFile=', 'fpFile=', 'minPath=', 'maxPath=', 'smiCol=',
471 'actCol=', 'nameCol=', 'nActs=', 'nBits=', 'biasList=', 'topN=',
472 'build', 'sigs', 'gains', 'details', 'score', 'noTitle'])
473 except Exception:
474 sys.stderr.write('Error parsing command line:\n')
475 import traceback
476 traceback.print_exc()
477 Usage()
478 for arg, val in args:
479 if arg == '-n':
480 details.numMols = int(val)
481 elif arg == '-c':
482 details.delim = ','
483 elif arg == '-s':
484 details.delim = ' '
485 elif arg == '-t':
486 details.delim = '\t'
487 elif arg == '-d':
488 details.dbName = val
489 elif arg == '--build':
490 details.doBuild = 1
491 elif arg == '--score':
492 details.doScore = 1
493 elif arg == '--gains':
494 details.doGains = 1
495 elif arg == '--sigs':
496 details.doSigs = 1
497 elif arg == '-details':
498 details.doDetails = 1
499 elif arg == '--catalog':
500 details.catalogName = val
501 elif arg == '--onbits':
502 details.onBitsName = val
503 elif arg == '--scoresFile':
504 details.scoresName = val
505 elif arg == '--gainsFile':
506 details.gainsName = val
507 elif arg == '--detailsFile':
508 details.detailsName = val
509 elif arg == '--fpFile':
510 details.fpName = val
511 elif arg == '--minPath':
512 details.minPath = int(val)
513 elif arg == '--maxPath':
514 details.maxPath = int(val)
515 elif arg == '--smiCol':
516 try:
517 details.smiCol = int(val)
518 except ValueError:
519 details.smiCol = val
520 elif arg == '--actCol':
521 try:
522 details.actCol = int(val)
523 except ValueError:
524 details.actCol = val
525 elif arg == '--nameCol':
526 try:
527 details.nameCol = int(val)
528 except ValueError:
529 details.nameCol = val
530 elif arg == '--nActs':
531 details.nActs = int(val)
532 elif arg == '--nBits':
533 details.nBits = int(val)
534 elif arg == '--noTitle':
535 details.hasTitle = 0
536 elif arg == '--biasList':
537 details.biasList = tuple(eval(val))
538 elif arg == '--topN':
539 details.topN = int(val)
540 elif arg == '-h':
541 Usage()
542 sys.exit(0)
543 else:
544 Usage()
545 if len(extras):
546 if details.dbName:
547 details.tableName = extras[0]
548 else:
549 details.inFileName = extras[0]
550 else:
551 Usage()
552
553
554 if __name__ == '__main__':
555 import time
556 details = RunDetails()
557 ParseArgs(details)
558 from io import StringIO
559 suppl = SupplierFromDetails(details)
560
561 cat = None
562 obls = None
563 if details.doBuild:
564 if not suppl:
565 message("We require inData to generate a catalog\n")
566 sys.exit(-2)
567 message("Building catalog\n")
568 t1 = time.time()
569 cat = BuildCatalog(suppl, maxPts=details.numMols, minPath=details.minPath,
570 maxPath=details.maxPath)
571 t2 = time.time()
572 message("\tThat took %.2f seconds.\n" % (t2 - t1))
573 if details.catalogName:
574 message("Dumping catalog data\n")
575 cPickle.dump(cat, open(details.catalogName, 'wb+'))
576 elif details.catalogName:
577 message("Loading catalog\n")
578 cat = cPickle.load(open(details.catalogName, 'rb'))
579 if details.onBitsName:
580 try:
581 obls = cPickle.load(open(details.onBitsName, 'rb'))
582 except Exception:
583 obls = None
584 else:
585 if len(obls) < (inD.count('\n') - 1):
586 obls = None
587 scores = None
588 if details.doScore:
589 if not suppl:
590 message("We require inData to score molecules\n")
591 sys.exit(-2)
592 if not cat:
593 message("We require a catalog to score molecules\n")
594 sys.exit(-2)
595 message("Scoring compounds\n")
596 if not obls or len(obls) < details.numMols:
597 scores, obls = ScoreMolecules(suppl, cat, maxPts=details.numMols, actName=details.actCol,
598 nActs=details.nActs)
599 if details.scoresName:
600 cPickle.dump(scores, open(details.scoresName, 'wb+'))
601 if details.onBitsName:
602 cPickle.dump(obls, open(details.onBitsName, 'wb+'))
603 else:
604 scores = ScoreFromLists(obls, suppl, cat, maxPts=details.numMols, actName=details.actCol,
605 nActs=details.nActs)
606 elif details.scoresName:
607 scores = cPickle.load(open(details.scoresName, 'rb'))
608
609 if details.fpName and os.path.exists(details.fpName) and not details.doSigs:
610 message("Reading fingerprints from file.\n")
611 fps = cPickle.load(open(details.fpName, 'rb'))
612 else:
613 fps = []
614 gains = None
615 if details.doGains:
616 if not suppl:
617 message("We require inData to calculate gains\n")
618 sys.exit(-2)
619 if not (cat or fps):
620 message("We require either a catalog or fingerprints to calculate gains\n")
621 sys.exit(-2)
622 message("Calculating Gains\n")
623 t1 = time.time()
624 if details.fpName:
625 collectFps = 1
626 else:
627 collectFps = 0
628 if not fps:
629 gains, fps = CalcGains(suppl, cat, topN=details.topN, actName=details.actCol,
630 nActs=details.nActs, biasList=details.biasList, collectFps=collectFps)
631 if details.fpName:
632 message("Writing fingerprint file.\n")
633 tmpF = open(details.fpName, 'wb+')
634 cPickle.dump(fps, tmpF, 1)
635 tmpF.close()
636 else:
637 gains = CalcGainsFromFps(suppl, fps, topN=details.topN, actName=details.actCol,
638 nActs=details.nActs, biasList=details.biasList)
639 t2 = time.time()
640 message("\tThat took %.2f seconds.\n" % (t2 - t1))
641 if details.gainsName:
642 outF = open(details.gainsName, 'w+')
643 OutputGainsData(outF, gains, cat, nActs=details.nActs)
644 else:
645 if details.gainsName:
646 inF = open(details.gainsName, 'r')
647 gains = ProcessGainsData(inF)
648
649 if details.doDetails:
650 if not cat:
651 message("We require a catalog to get details\n")
652 sys.exit(-2)
653 if not gains:
654 message("We require gains data to get details\n")
655 sys.exit(-2)
656 io = StringIO()
657 io.write('id,SMILES,gain\n')
658 ShowDetails(cat, gains, nToDo=details.nBits, outF=io)
659 if details.detailsName:
660 open(details.detailsName, 'w+').write(io.getvalue())
661 else:
662 sys.stderr.write(io.getvalue())
663