1
2
3
4
5
6
7
8
9
10 """ utility functionality for molecular similarity
11 includes a command line app for screening databases
12
13
14 Sample Usage:
15
16 python MolSimilarity.py -d data.gdb -t daylight_sig --idName="Mol_ID" \
17 --topN=100 --smiles='c1(C=O)ccc(Oc2ccccc2)cc1' --smilesTable=raw_dop_data \
18 --smilesName="structure" -o results.csv
19
20 """
21 import types
22
23 from rdkit import Chem
24 from rdkit import DataStructs
25 from rdkit.Chem.Fingerprints import FingerprintMols, DbFpSupplier
26 from rdkit.DataStructs.TopNContainer import TopNContainer
27 from rdkit.Dbase import DbModule
28 from rdkit.Dbase.DbConnection import DbConnect
29 from rdkit.six.moves import cPickle
30
31 try:
32 from rdkit.VLib.NodeLib.DbPickleSupplier import _lazyDataSeq as _dataSeq
33 except ImportError:
34 _dataSeq = None
35
36
def _ConstructSQL(details, extraFields=''):
    """Build the SELECT statement used to pull ids (and optional extras).

    The id column of ``details.tableName`` is always selected. A SMILES
    column and/or an activity column is added, together with a join on the
    table holding it, when both the corresponding table name and column
    name are set on ``details``.

    Arguments:
      - details: object providing ``tableName``, ``idName``,
        ``smilesTableName``, ``smilesName``, ``actTableName`` and
        ``actName`` attributes.
      - extraFields: optional text appended verbatim to the field list.

    Returns:
      the SQL command as a string.

    NOTE: table and column names are interpolated directly into the SQL
    text (identifiers cannot be bound as query parameters), so they must
    come from a trusted source.
    """
    fields = ['%s.%s' % (details.tableName, details.idName)]
    joins = []

    if details.smilesTableName and details.smilesName:
        fields.append(details.smilesName)
        joins.append('join %s smi on smi.%s=%s.%s' % (details.smilesTableName, details.idName,
                                                      details.tableName, details.idName))

    if details.actTableName and details.actName:
        fields.append(details.actName)
        joins.append('join %s act on act.%s=%s.%s' % (details.actTableName, details.idName,
                                                      details.tableName, details.idName))

    if extraFields:
        fields.append(extraFields)

    # The join clauses have to be whitespace separated: gluing them together
    # directly produces "...<idName>join ..." which is not valid SQL.
    return 'select %s from %s %s' % (','.join(fields), details.tableName, ' '.join(joins))
55
56
def ScreenInDb(details, mol):
    """Screen for molecules similar to ``mol``, scoring inside the database.

    The probe molecule is fingerprinted with the attributes of ``details``
    passed as keyword arguments. When ``details.metric`` is Tanimoto, Dice
    or cosine similarity, the score is computed by the matching ``rd_*``
    SQL function and the rows are fetched ordered by decreasing score.
    Any other metric falls back to pulling the fingerprints and scoring
    them in Python with ScreenFingerprints().

    Arguments:
      - details: the screening details (db/table names, metric, threshold,
        topN, fingerprint column name, ...).
      - mol: the probe molecule.

    Returns:
      the result rows from the database (or the ScreenFingerprints()
      result for the fallback); an empty list if the probe cannot be
      fingerprinted or no database connection can be made.
    """
    try:
        # apply() does not exist in Python 3; argument unpacking is equivalent.
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback
        FingerprintMols.error('Error: problems fingerprinting molecule.\n')
        traceback.print_exc()
        return []

    # Map the metric onto the name of the SQL function implementing it.
    func = None
    for metric, sqlName in ((DataStructs.TanimotoSimilarity, 'rd_tanimoto'),
                            (DataStructs.DiceSimilarity, 'rd_dice'),
                            (DataStructs.CosineSimilarity, 'rd_cosine')):
        if details.metric == metric:
            func = sqlName
            break

    if func is None:
        # No in-database implementation of this metric: score in Python.
        data = GetFingerprints(details)
        return ScreenFingerprints(details, data, mol)

    if not (details.dbName and details.tableName):
        FingerprintMols.error('Error: a database name and a table name are required '
                              'for in-database screening.\n')
        return []
    try:
        conn = DbConnect(details.dbName, details.tableName)
        if hasattr(details, 'dbUser'):
            conn.user = details.dbUser
        if hasattr(details, 'dbPassword'):
            conn.password = details.dbPassword
    except Exception:
        import traceback
        FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' %
                              (details.dbName, details.tableName))
        traceback.print_exc()
        # Without a connection there is nothing to query.
        return []

    extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder, details.fpColName)
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        # Filtering on the computed "tani" alias requires a subquery.
        # NOTE(review): this is a strict '>' while the Python-side screening
        # in this file keeps scores '>=' the threshold - confirm which is intended.
        cmd = "select * from (%s) tmp where tani>%f" % (cmd, details.screenThresh)
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += " limit %d" % details.topN

    curs = conn.GetCursor()
    # The probe fingerprint is bound to the placeholder as a bit string.
    curs.execute(cmd, (probeFp.ToBitString(), ))
    return curs.fetchall()
107
108
def GetFingerprints(details):
    """ returns an iterable sequence of fingerprints
    each fingerprint will have a _fieldsFromDb member whose first entry is
    the id.

    The fingerprints come from the database when both ``details.dbName``
    and ``details.tableName`` are set, otherwise from the pickle file
    ``details.inFileName`` (a series of pickled (ID, fingerprint) tuples).

    Returns:
      the fingerprint supplier/list; an empty list if the database
      connection or the input file cannot be opened; None if neither a
      database nor an input file is specified.
    """
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' %
                                  (details.dbName, details.tableName))
            traceback.print_exc()
            # Nothing can be read without a connection.
            return []
        cmd = _ConstructSQL(details, extraFields=details.fpColName)
        curs = conn.GetCursor()

        if _dataSeq:
            suppl = _dataSeq(curs, cmd, depickle=not details.noPickle,
                             klass=DataStructs.ExplicitBitVect)
            # presumably stored so the connection outlives this function - verify
            _dataSeq._conn = conn
        else:
            # The supplier needs a result set to iterate over; build it from
            # the query constructed above.
            # NOTE(review): confirm DbResultSet(cursor, conn, cmd) against the
            # installed rdkit.Dbase version.
            from rdkit.Dbase.DbResultSet import DbResultSet
            data = DbResultSet(curs, conn, cmd)
            suppl = DbFpSupplier.ForwardDbFpSupplier(data, fpColName=details.fpColName)
        return suppl

    if details.inFileName:
        try:
            # Pickles are binary data: text mode breaks loading on Python 3.
            inF = open(details.inFileName, 'rb')
        except IOError:
            import traceback
            FingerprintMols.error('Error: Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
            return []

        suppl = []
        with inF:
            # NOTE: unpickling executes code chosen by the file's author; only
            # read fingerprint files from trusted sources.
            while True:
                try:
                    ID, fp = cPickle.load(inF)
                except EOFError:
                    # normal termination: no more pickled tuples in the file
                    break
                except Exception:
                    # Best effort: keep what was read so far, but do not hide
                    # the fact that the file is damaged.
                    import traceback
                    FingerprintMols.error('Error: Problems reading from file %s\n' %
                                          (details.inFileName))
                    traceback.print_exc()
                    break
                fp._fieldsFromDb = [ID]
                suppl.append(fp)
        return suppl

    return None
159
160
def ScreenFingerprints(details, data, mol=None, probeFp=None):
    """ Returns a list of results

    Scores every entry of ``data`` against the probe fingerprint and
    returns (ID, score) tuples.

    Arguments:
      - details: the screening details; ``doThreshold``, ``topN``,
        ``screenThresh``, ``noPickle``, ``metric`` and (optionally)
        ``stopAfter`` are read here.
      - data: iterable of either (ID, fingerprint) sequences or fingerprint
        objects carrying a ``_fieldsFromDb`` member whose first entry is
        the ID. When ``details.noPickle`` is set each entry must be an
        (ID, pkl) pair and ``details.metric`` is called directly on the
        probe and ``str(pkl)``.
      - mol: the probe molecule, fingerprinted only if ``probeFp`` is None.
      - probeFp: the probe fingerprint, if already available.

    Returns:
      a list of (ID, score) tuples: the contents of a TopNContainer of size
      ``details.topN`` when no threshold is used and topN > 0, otherwise
      every entry (filtered by ``score >= details.screenThresh`` when
      ``details.doThreshold`` is set). An empty list is returned if the
      probe cannot be fingerprinted or there is no data.
    """
    if probeFp is None:
        try:
            # apply() does not exist in Python 3; argument unpacking is equivalent.
            probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
        except Exception:
            import traceback
            FingerprintMols.error('Error: problems fingerprinting molecule.\n')
            traceback.print_exc()
            return []
    if not probeFp:
        return []
    if data is None:
        # GetFingerprints() returns None when no data source is configured.
        return []

    useTopN = not details.doThreshold and details.topN > 0
    topN = TopNContainer(details.topN) if useTopN else None

    res = []
    count = 0
    for pt in data:
        if not details.noPickle:
            # types.TupleType/ListType are Python 2 only; isinstance works everywhere.
            if isinstance(pt, (tuple, list)):
                ID, fp = pt
            else:
                fp = pt
                ID = pt._fieldsFromDb[0]
            score = DataStructs.FingerprintSimilarity(probeFp, fp, details.metric)
        else:
            ID, pkl = pt
            score = details.metric(probeFp, str(pkl))

        if useTopN:
            topN.Insert(score, ID)
        elif not details.doThreshold or score >= details.screenThresh:
            res.append((ID, score))

        # stopAfter counts entries examined, not entries kept.
        count += 1
        if hasattr(details, 'stopAfter') and count >= details.stopAfter:
            break

    if useTopN:
        for score, ID in topN:
            res.append((ID, score))

    return res
207
208
def ScreenFromDetails(details, mol=None):
    """ Returns a list of results

    Resolves the probe molecule, runs the similarity screen and, when
    ``details.outFileName`` is set, writes the results to that file as one
    comma-separated line per result.

    Arguments:
      - details: the screening details. ``probeMol`` / ``probeSmiles``
        supply the probe when ``mol`` is not given; ``useDbSimilarity``
        (optional) selects in-database scoring via ScreenInDb().
      - mol: the probe molecule (optional).

    Returns:
      the screening results, or None if no probe molecule could be
      obtained or the output file could not be opened.
    """
    if not mol:
        if not details.probeMol:
            smi = details.probeSmiles
            try:
                mol = Chem.MolFromSmiles(smi)
            except Exception:
                import traceback
                FingerprintMols.error('Error: problems generating molecule for smiles: %s\n' %
                                      (smi))
                traceback.print_exc()
                return
        else:
            mol = details.probeMol
    if not mol:
        return

    # The output file is opened before screening so that an unwritable
    # destination is reported before any work is done.
    outF = None
    if details.outFileName:
        try:
            outF = open(details.outFileName, 'w+')
        except IOError:
            FingerprintMols.error("Error: could not open output file %s for writing\n" %
                                  (details.outFileName))
            return None

    try:
        if getattr(details, 'useDbSimilarity', False):
            res = ScreenInDb(details, mol)
        else:
            data = GetFingerprints(details)
            res = ScreenFingerprints(details, data, mol)
        if outF:
            for pt in res:
                outF.write(','.join([str(x) for x in pt]))
                outF.write('\n')
    finally:
        # Make sure the results are flushed and the handle released even if
        # the screen itself fails.
        if outF:
            outF.close()
    return res
248
249
250 _usageDoc = """
251 Usage: MolSimilarity.py [args] <fName>
252
253 If <fName> is provided and no tableName is specified (see below),
254 data will be read from the pickled file <fName>. This file should
255 contain a series of pickled (ID,fingerprint) tuples.
256
257 NOTE: at the moment the user is responsible for ensuring that the
258 fingerprint parameters given at run time (used to fingerprint the
259 probe molecule) match those used to generate the input fingerprints.
260
261 Command line arguments are:
262 - --smiles=val: sets the SMILES for the input molecule. This is
263 a required argument.
264
265 - -d _dbName_: set the name of the database from which
266 to pull input fingerprint information.
267
268 - -t _tableName_: set the name of the database table
269 from which to pull input fingerprint information
270
271 - --smilesTable=val: sets the name of the database table
272 which contains SMILES for the input fingerprints. If this
273 information is provided along with smilesName (see below),
274 the output file will contain SMILES data
275
276 - --smilesName=val: sets the name of the SMILES column
277 in the input database. Default is *SMILES*.
278
279 - --topN=val: sets the number of results to return.
280 Default is *10*.
281
282 - --thresh=val: sets the similarity threshold.
283
284 - --idName=val: sets the name of the id column in the input
285 database. Default is *ID*.
286
287 - -o _outFileName_: name of the output file (output will
288 be a CSV file with one line for each of the output molecules
289
290 - --dice: use the DICE similarity metric instead of Tanimoto
291
292 - --cosine: use the cosine similarity metric instead of Tanimoto
293
294 - --fpColName=val: name to use for the column which stores
295 fingerprints (in pickled format) in the output db table.
296 Default is *AutoFragmentFP*
297
298 - --minPath=val: minimum path length to be included in
299 fragment-based fingerprints. Default is *1*.
300
301 - --maxPath=val: maximum path length to be included in
302 fragment-based fingerprints. Default is *7*.
303
304 - --nBitsPerHash: number of bits to be set in the output
305 fingerprint for each fragment. Default is *4*.
306
307 - --discrim: use of path-based discriminators to hash bits.
308 Default is *false*.
309
310 - -V: include valence information in the fingerprints
311 Default is *false*.
312
313 - -H: include Hs in the fingerprint
314 Default is *false*.
315
316 - --useMACCS: use the public MACCS keys to do the fingerprinting
317 (instead of a daylight-type fingerprint)
318
319
320 """
if __name__ == '__main__':
    # Command-line entry point: announce the tool, install this module's
    # usage text on FingerprintMols (whose ParseArgs() handles the command
    # line), then screen using the parsed details.
    FingerprintMols.message("This is MolSimilarity\n\n")
    FingerprintMols._usageDoc = _usageDoc
    ScreenFromDetails(FingerprintMols.ParseArgs())
326