Package rdkit :: Package Chem :: Package MolDb :: Module Loader_orig
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MolDb.Loader_orig

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2007-2008 Greg Landrum 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  from rdkit import Chem 
 11  from rdkit.Chem import AllChem 
 12  from rdkit.Chem import Lipinski, Descriptors, Crippen 
 13  from rdkit.Dbase.DbConnection import DbConnect 
 14  from rdkit.Dbase import DbModule 
 15  import re 
 16   
 17  #set up the logger: 
 18  import rdkit.RDLogger as logging 
 19  logger = logging.logger() 
 20  logger.setLevel(logging.INFO) 
 21   
 22   
23 -def ProcessMol(mol, typeConversions, globalProps, nDone, nameProp='_Name', nameCol='compound_id', 24 redraw=False, keepHs=False, skipProps=False, addComputedProps=False, 25 skipSmiles=False, uniqNames=None, namesSeen=None):
26 if not mol: 27 raise ValueError('no molecule') 28 if keepHs: 29 Chem.SanitizeMol(mol) 30 try: 31 nm = mol.GetProp(nameProp) 32 except KeyError: 33 nm = None 34 if not nm: 35 nm = 'Mol_%d' % nDone 36 if uniqNames and nm in namesSeen: 37 logger.error('duplicate compound id (%s) encountered. second instance skipped.' % nm) 38 return None 39 namesSeen.add(nm) 40 row = [nm] 41 if not skipProps: 42 if addComputedProps: 43 nHD = Lipinski.NumHDonors(mol) 44 mol.SetProp('DonorCount', str(nHD)) 45 nHA = Lipinski.NumHAcceptors(mol) 46 mol.SetProp('AcceptorCount', str(nHA)) 47 nRot = Lipinski.NumRotatableBonds(mol) 48 mol.SetProp('RotatableBondCount', str(nRot)) 49 MW = Descriptors.MolWt(mol) 50 mol.SetProp('AMW', str(MW)) 51 logp = Crippen.MolLogP(mol) 52 mol.SetProp('MolLogP', str(logp)) 53 54 pns = list(mol.GetPropNames()) 55 pD = {} 56 for pi, pn in enumerate(pns): 57 if pn.lower() == nameCol.lower(): 58 continue 59 pv = mol.GetProp(pn).strip() 60 if pv.find('>') < 0 and pv.find('<') < 0: 61 colTyp = globalProps.get(pn, 2) 62 while colTyp > 0: 63 try: 64 tpi = typeConversions[colTyp][1](pv) 65 except Exception: 66 colTyp -= 1 67 else: 68 break 69 globalProps[pn] = colTyp 70 pD[pn] = typeConversions[colTyp][1](pv) 71 else: 72 pD[pn] = pv 73 else: 74 pD = {} 75 if redraw: 76 AllChem.Compute2DCoords(m) 77 if not skipSmiles: 78 row.append(Chem.MolToSmiles(mol, True)) 79 row.append(DbModule.binaryHolder(mol.ToBinary())) 80 row.append(pD) 81 return row
82 83
84 -def ConvertRows(rows, globalProps, defaultVal, skipSmiles):
85 for i, row in enumerate(rows): 86 newRow = [row[0], row[1]] 87 pD = row[-1] 88 for pn in globalProps: 89 pv = pD.get(pn, defaultVal) 90 newRow.append(pv) 91 newRow.append(row[2]) 92 if not skipSmiles: 93 newRow.append(row[3]) 94 rows[i] = newRow
95 96
97 -def LoadDb(suppl, dbName, nameProp='_Name', nameCol='compound_id', silent=False, redraw=False, 98 errorsTo=None, keepHs=False, defaultVal='N/A', skipProps=False, regName='molecules', 99 skipSmiles=False, maxRowsCached=-1, uniqNames=False, addComputedProps=False, 100 lazySupplier=False, startAnew=True):
101 if not lazySupplier: 102 nMols = len(suppl) 103 else: 104 nMols = -1 105 if not silent: 106 logger.info("Generating molecular database in file %s" % dbName) 107 if not lazySupplier: 108 logger.info(" Processing %d molecules" % nMols) 109 rows = [] 110 globalProps = {} 111 namesSeen = set() 112 nDone = 0 113 typeConversions = {0: ('varchar', str), 1: ('float', float), 2: ('int', int)} 114 for m in suppl: 115 nDone += 1 116 if not m: 117 if errorsTo: 118 if hasattr(suppl, 'GetItemText'): 119 d = suppl.GetItemText(nDone - 1) 120 errorsTo.write(d) 121 else: 122 logger.warning('full error file support not complete') 123 continue 124 125 row = ProcessMol(m, typeConversions, globalProps, nDone, nameProp=nameProp, nameCol=nameCol, 126 redraw=redraw, keepHs=keepHs, skipProps=skipProps, 127 addComputedProps=addComputedProps, skipSmiles=skipSmiles, uniqNames=uniqNames, 128 namesSeen=namesSeen) 129 if row is None: 130 continue 131 rows.append([nDone] + row) 132 if not silent and not nDone % 100: 133 logger.info(' done %d' % nDone) 134 if len(rows) == maxRowsCached: 135 break 136 137 nameDef = '%s varchar not null' % nameCol 138 if uniqNames: 139 nameDef += ' unique' 140 typs = ['guid integer not null primary key', nameDef] 141 pns = [] 142 for pn, v in globalProps.items(): 143 addNm = re.sub(r'[\W]', '_', pn) 144 typs.append('%s %s' % (addNm, typeConversions[v][0])) 145 pns.append(pn.lower()) 146 147 if not skipSmiles: 148 if 'smiles' not in pns: 149 typs.append('smiles varchar') 150 else: 151 typs.append('cansmiles varchar') 152 typs.append('molpkl %s' % (DbModule.binaryTypeName)) 153 conn = DbConnect(dbName) 154 curs = conn.GetCursor() 155 if startAnew: 156 try: 157 curs.execute('drop table %s' % regName) 158 except Exception: 159 pass 160 curs.execute('create table %s (%s)' % (regName, ','.join(typs))) 161 else: 162 curs.execute('select * from %s limit 1' % (regName, )) 163 ocolns = set([x[0] for x in curs.description]) 164 ncolns = set([x.split()[0] for x in typs]) 165 if ncolns != ocolns: 166 raise ValueError('Column names do not match: %s != %s' % (ocolns, ncolns)) 167 curs.execute('select max(guid) from %s' % (regName, )) 168 offset = curs.fetchone()[0] 169 for row in rows: 170 row[0] += offset 171 172 qs = ','.join([DbModule.placeHolder for x in typs]) 173 174 ConvertRows(rows, globalProps, defaultVal, skipSmiles) 175 curs.executemany('insert into %s values (%s)' % (regName, qs), rows) 176 conn.Commit() 177 178 rows = [] 179 while 1: 180 nDone += 1 181 try: 182 m = next(suppl) 183 except StopIteration: 184 break 185 if not m: 186 if errorsTo: 187 if hasattr(suppl, 'GetItemText'): 188 d = suppl.GetItemText(nDone - 1) 189 errorsTo.write(d) 190 else: 191 logger.warning('full error file support not complete') 192 continue 193 tmpProps = {} 194 row = ProcessMol(m, typeConversions, globalProps, nDone, nameProp=nameProp, nameCol=nameCol, 195 redraw=redraw, keepHs=keepHs, skipProps=skipProps, 196 addComputedProps=addComputedProps, skipSmiles=skipSmiles, uniqNames=uniqNames, 197 namesSeen=namesSeen) 198 if not row: 199 continue 200 rows.append([nDone] + row) 201 if not silent and not nDone % 100: 202 logger.info(' done %d' % nDone) 203 if len(rows) == maxRowsCached: 204 ConvertRows(rows, globalProps, defaultVal, skipSmiles) 205 curs.executemany('insert into %s values (%s)' % (regName, qs), rows) 206 conn.Commit() 207 rows = [] 208 if len(rows): 209 ConvertRows(rows, globalProps, defaultVal, skipSmiles) 210 curs.executemany('insert into %s values (%s)' % (regName, qs), rows) 211 conn.Commit()
212