1
2
3
4
5
6
7
8
9
10 from rdkit import Chem
11 from rdkit.Chem import AllChem
12 from rdkit.Chem import Lipinski, Descriptors, Crippen
13 from rdkit.Dbase.DbConnection import DbConnect
14 from rdkit.Dbase import DbModule
15 import re
16
17
18 import rdkit.RDLogger as logging
19 logger = logging.logger()
20 logger.setLevel(logging.INFO)
21
22
23 -def ProcessMol(mol, typeConversions, globalProps, nDone, nameProp='_Name', nameCol='compound_id',
24 redraw=False, keepHs=False, skipProps=False, addComputedProps=False,
25 skipSmiles=False, uniqNames=None, namesSeen=None):
26 if not mol:
27 raise ValueError('no molecule')
28 if keepHs:
29 Chem.SanitizeMol(mol)
30 try:
31 nm = mol.GetProp(nameProp)
32 except KeyError:
33 nm = None
34 if not nm:
35 nm = 'Mol_%d' % nDone
36 if uniqNames and nm in namesSeen:
37 logger.error('duplicate compound id (%s) encountered. second instance skipped.' % nm)
38 return None
39 namesSeen.add(nm)
40 row = [nm]
41 if not skipProps:
42 if addComputedProps:
43 nHD = Lipinski.NumHDonors(mol)
44 mol.SetProp('DonorCount', str(nHD))
45 nHA = Lipinski.NumHAcceptors(mol)
46 mol.SetProp('AcceptorCount', str(nHA))
47 nRot = Lipinski.NumRotatableBonds(mol)
48 mol.SetProp('RotatableBondCount', str(nRot))
49 MW = Descriptors.MolWt(mol)
50 mol.SetProp('AMW', str(MW))
51 logp = Crippen.MolLogP(mol)
52 mol.SetProp('MolLogP', str(logp))
53
54 pns = list(mol.GetPropNames())
55 pD = {}
56 for pi, pn in enumerate(pns):
57 if pn.lower() == nameCol.lower():
58 continue
59 pv = mol.GetProp(pn).strip()
60 if pv.find('>') < 0 and pv.find('<') < 0:
61 colTyp = globalProps.get(pn, 2)
62 while colTyp > 0:
63 try:
64 tpi = typeConversions[colTyp][1](pv)
65 except Exception:
66 colTyp -= 1
67 else:
68 break
69 globalProps[pn] = colTyp
70 pD[pn] = typeConversions[colTyp][1](pv)
71 else:
72 pD[pn] = pv
73 else:
74 pD = {}
75 if redraw:
76 AllChem.Compute2DCoords(m)
77 if not skipSmiles:
78 row.append(Chem.MolToSmiles(mol, True))
79 row.append(DbModule.binaryHolder(mol.ToBinary()))
80 row.append(pD)
81 return row
82
83
84 -def ConvertRows(rows, globalProps, defaultVal, skipSmiles):
85 for i, row in enumerate(rows):
86 newRow = [row[0], row[1]]
87 pD = row[-1]
88 for pn in globalProps:
89 pv = pD.get(pn, defaultVal)
90 newRow.append(pv)
91 newRow.append(row[2])
92 if not skipSmiles:
93 newRow.append(row[3])
94 rows[i] = newRow
95
96
97 -def LoadDb(suppl, dbName, nameProp='_Name', nameCol='compound_id', silent=False, redraw=False,
98 errorsTo=None, keepHs=False, defaultVal='N/A', skipProps=False, regName='molecules',
99 skipSmiles=False, maxRowsCached=-1, uniqNames=False, addComputedProps=False,
100 lazySupplier=False, startAnew=True):
101 if not lazySupplier:
102 nMols = len(suppl)
103 else:
104 nMols = -1
105 if not silent:
106 logger.info("Generating molecular database in file %s" % dbName)
107 if not lazySupplier:
108 logger.info(" Processing %d molecules" % nMols)
109 rows = []
110 globalProps = {}
111 namesSeen = set()
112 nDone = 0
113 typeConversions = {0: ('varchar', str), 1: ('float', float), 2: ('int', int)}
114 for m in suppl:
115 nDone += 1
116 if not m:
117 if errorsTo:
118 if hasattr(suppl, 'GetItemText'):
119 d = suppl.GetItemText(nDone - 1)
120 errorsTo.write(d)
121 else:
122 logger.warning('full error file support not complete')
123 continue
124
125 row = ProcessMol(m, typeConversions, globalProps, nDone, nameProp=nameProp, nameCol=nameCol,
126 redraw=redraw, keepHs=keepHs, skipProps=skipProps,
127 addComputedProps=addComputedProps, skipSmiles=skipSmiles, uniqNames=uniqNames,
128 namesSeen=namesSeen)
129 if row is None:
130 continue
131 rows.append([nDone] + row)
132 if not silent and not nDone % 100:
133 logger.info(' done %d' % nDone)
134 if len(rows) == maxRowsCached:
135 break
136
137 nameDef = '%s varchar not null' % nameCol
138 if uniqNames:
139 nameDef += ' unique'
140 typs = ['guid integer not null primary key', nameDef]
141 pns = []
142 for pn, v in globalProps.items():
143 addNm = re.sub(r'[\W]', '_', pn)
144 typs.append('%s %s' % (addNm, typeConversions[v][0]))
145 pns.append(pn.lower())
146
147 if not skipSmiles:
148 if 'smiles' not in pns:
149 typs.append('smiles varchar')
150 else:
151 typs.append('cansmiles varchar')
152 typs.append('molpkl %s' % (DbModule.binaryTypeName))
153 conn = DbConnect(dbName)
154 curs = conn.GetCursor()
155 if startAnew:
156 try:
157 curs.execute('drop table %s' % regName)
158 except Exception:
159 pass
160 curs.execute('create table %s (%s)' % (regName, ','.join(typs)))
161 else:
162 curs.execute('select * from %s limit 1' % (regName, ))
163 ocolns = set([x[0] for x in curs.description])
164 ncolns = set([x.split()[0] for x in typs])
165 if ncolns != ocolns:
166 raise ValueError('Column names do not match: %s != %s' % (ocolns, ncolns))
167 curs.execute('select max(guid) from %s' % (regName, ))
168 offset = curs.fetchone()[0]
169 for row in rows:
170 row[0] += offset
171
172 qs = ','.join([DbModule.placeHolder for x in typs])
173
174 ConvertRows(rows, globalProps, defaultVal, skipSmiles)
175 curs.executemany('insert into %s values (%s)' % (regName, qs), rows)
176 conn.Commit()
177
178 rows = []
179 while 1:
180 nDone += 1
181 try:
182 m = next(suppl)
183 except StopIteration:
184 break
185 if not m:
186 if errorsTo:
187 if hasattr(suppl, 'GetItemText'):
188 d = suppl.GetItemText(nDone - 1)
189 errorsTo.write(d)
190 else:
191 logger.warning('full error file support not complete')
192 continue
193 tmpProps = {}
194 row = ProcessMol(m, typeConversions, globalProps, nDone, nameProp=nameProp, nameCol=nameCol,
195 redraw=redraw, keepHs=keepHs, skipProps=skipProps,
196 addComputedProps=addComputedProps, skipSmiles=skipSmiles, uniqNames=uniqNames,
197 namesSeen=namesSeen)
198 if not row:
199 continue
200 rows.append([nDone] + row)
201 if not silent and not nDone % 100:
202 logger.info(' done %d' % nDone)
203 if len(rows) == maxRowsCached:
204 ConvertRows(rows, globalProps, defaultVal, skipSmiles)
205 curs.executemany('insert into %s values (%s)' % (regName, qs), rows)
206 conn.Commit()
207 rows = []
208 if len(rows):
209 ConvertRows(rows, globalProps, defaultVal, skipSmiles)
210 curs.executemany('insert into %s values (%s)' % (regName, qs), rows)
211 conn.Commit()
212