Package rdkit :: Package Chem :: Package MolKey :: Module MolKey
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MolKey.MolKey

  1  # 
  2  #  Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. 
  3  #  All rights reserved. 
  4  # 
  5  # Redistribution and use in source and binary forms, with or without 
  6  # modification, are permitted provided that the following conditions are 
  7  # met: 
  8  # 
  9  #     * Redistributions of source code must retain the above copyright 
 10  #       notice, this list of conditions and the following disclaimer. 
 11  #     * Redistributions in binary form must reproduce the above 
 12  #       copyright notice, this list of conditions and the following 
 13  #       disclaimer in the documentation and/or other materials provided 
 14  #       with the distribution. 
 15  #     * Neither the name of Novartis Institutes for BioMedical Research Inc. 
 16  #       nor the names of its contributors may be used to endorse or promote 
 17  #       products derived from this software without specific prior written permission. 
 18  # 
 19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
 22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
 23  # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 24  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 25  # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 26  # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 27  # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 28  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 29  # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 30  # 
 31  # Created by Greg Landrum based on code from Thomas Mueller 
 32  # August 2012 
 33  # 
 34  try: 
 35    from rdkit.Avalon import pyAvalonTools 
 36  except ImportError: 
 37    raise ImportError("This code requires the RDKit to be built with AvalonTools support") 
 38   
 39  import base64 
 40  from collections import namedtuple 
 41  import hashlib 
 42  import logging 
 43  import os 
 44  import re 
 45  import tempfile 
 46  import uuid 
 47   
 48  from rdkit import Chem 
 49  from rdkit import RDConfig 
 50  from rdkit.Chem.MolKey import InchiInfo 
 51   
 52   
53 -class MolIdentifierException(Exception):
54 pass
55 56
57 -class BadMoleculeException(Exception):
58 pass
59 60 61 MOL_KEY_VERSION = '1' 62 63 ERROR_DICT = dict(BAD_MOLECULE=1, ALIAS_CONVERSION_FAILED=2, TRANSFORMED=4, FRAGMENTS_FOUND=8, 64 EITHER_WARNING=16, STEREO_ERROR=32, DUBIOUS_STEREO_REMOVED=64, ATOM_CLASH=128, 65 ATOM_CHECK_FAILED=256, SIZE_CHECK_FAILED=512, RECHARGED=1024, 66 STEREO_FORCED_BAD=2048, STEREO_TRANSFORMED=4096, TEMPLATE_TRANSFORMED=8192, 67 INCHI_COMPUTATION_ERROR=65536, RDKIT_CONVERSION_ERROR=131072, 68 INCHI_READWRITE_ERROR=262144, NULL_MOL=524288) 69 70 INCHI_COMPUTATION_ERROR = ERROR_DICT['INCHI_COMPUTATION_ERROR'] 71 RDKIT_CONVERSION_ERROR = ERROR_DICT['RDKIT_CONVERSION_ERROR'] 72 INCHI_READWRITE_ERROR = ERROR_DICT['INCHI_READWRITE_ERROR'] 73 NULL_MOL = ERROR_DICT['NULL_MOL'] 74 75 BAD_SET = (pyAvalonTools.StruChkResult.bad_set | INCHI_COMPUTATION_ERROR | RDKIT_CONVERSION_ERROR | 76 INCHI_READWRITE_ERROR | NULL_MOL) 77 78 GET_STEREO_RE = re.compile(r'^InChI=1S(.*?)/(t.*?)/m\d/s1(.*$)') 79 NULL_SMILES_RE = re.compile(r'^\s*$|^\s*NO_STRUCTURE\s*$', re.IGNORECASE) 80 PATTERN_NULL_MOL = r'^([\s0]+[1-9]+[\s]+V[\w]*)' 81 82 CHIRAL_POS = 12 83 84 T_NULL_MOL = (NULL_MOL, '') # the NULL mol result tuple 85 86 stereo_code_dict = {} 87 stereo_code_dict['DEFAULT'] = 0 88 stereo_code_dict['S_ACHIR'] = 1 89 stereo_code_dict['S_ABS'] = 2 90 stereo_code_dict['S_REL'] = 3 91 stereo_code_dict['S_PART'] = 4 92 stereo_code_dict['S_UNKN'] = 5 93 stereo_code_dict['S_ABS_ACHIR'] = 6 94 stereo_code_dict['R_ONE'] = 11 95 stereo_code_dict['R_REL'] = 12 96 stereo_code_dict['R_OTHER'] = 13 97 stereo_code_dict['MX_ENANT'] = 21 98 stereo_code_dict['MX_DIAST'] = 22 99 stereo_code_dict['MX_SP2'] = 31 100 stereo_code_dict['MX_DIAST_ABS'] = 32 101 stereo_code_dict['MX_DIAST_REL'] = 33 102 stereo_code_dict['OTHER'] = 100 103 stereo_code_dict['UNDEFINED'] = 200 104 105
106 -def _fix_all(pat, sbt, my_string):
107 try: 108 new_string = re.sub(pat, sbt, my_string) 109 return new_string 110 except: 111 return None
112 113
114 -def _fix_line_ends(my_string):
115 pat = '\r\n{0,1}' 116 sbt = '\n' 117 return _fix_all(pat, sbt, my_string)
118 119
120 -def _fix_chemdraw_header(my_string):
121 pat = '0V2000' 122 sbt = 'V2000' 123 return _fix_all(pat, sbt, my_string)
124 125
126 -def _ctab_has_atoms(ctab_lines):
127 ''' look at atom count position (line 4, characters 0:3) 128 Return True if the count is >0, False if 0. 129 Throw BadMoleculeException if there are no characters 130 at the required position or if they cannot be converted 131 to a positive integer 132 ''' 133 try: 134 str_a_count = ctab_lines[3][0:3] 135 a_count = int(str_a_count) 136 if a_count < 0: 137 raise BadMoleculeException('Atom count negative') 138 if a_count > 0: 139 rval = True 140 else: 141 rval = False 142 except IndexError: 143 raise BadMoleculeException('Invalid molfile format') 144 except ValueError: 145 raise BadMoleculeException('Expected integer') 146 147 return rval
148 149
150 -def _ctab_remove_chiral_flag(ctab_lines):
151 ''' read the chiral flag (line 4, characters 12:15) 152 and set it to 0. Return True if it was 1, False if 0. 153 Throw BadMoleculeException if there are no characters 154 at the required position or if they where not 0 or 1 155 ''' 156 try: 157 str_a_count = ctab_lines[3][12:15] 158 a_count = int(str_a_count) 159 if a_count == 0: 160 rval = False 161 elif a_count == 1: 162 rval = True 163 orig_line = ctab_lines[3] 164 ctab_lines[3] = orig_line[:CHIRAL_POS] + ' 0' + orig_line[CHIRAL_POS + 3:] 165 else: 166 raise BadMoleculeException('Expected chiral flag 0 or 1') 167 except IndexError: 168 raise BadMoleculeException('Invalid molfile format') 169 except ValueError: 170 raise BadMoleculeException('Expected integer, got {0}'.format(str_a_count)) 171 172 return rval
173 174 175 __initCalled = False 176 177
178 -def initStruchk(configDir=None, logFile=None):
179 global __initCalled 180 if configDir is None: 181 configDir = os.path.join(RDConfig.RDDataDir, 'struchk') 182 if configDir[-1] != os.path.sep: 183 configDir += os.path.sep 184 if logFile is None: 185 fd = tempfile.NamedTemporaryFile(suffix='.log', delete=False) 186 fd.close() 187 logFile = fd.name 188 struchk_init = '''-tm 189 -ta {0}checkfgs.trn 190 -tm 191 -or 192 -ca {0}checkfgs.chk 193 -cc 194 -cl 3 195 -cs 196 -cn 999 197 -l {1}\n'''.format(configDir, logFile) 198 initRes = pyAvalonTools.InitializeCheckMol(struchk_init) 199 if initRes: 200 raise ValueError('bad result from InitializeCheckMol: ' + str(initRes)) 201 __initCalled = True
202 203
204 -def CheckCTAB(ctab, isSmiles=True):
205 if not __initCalled: 206 initStruchk() 207 mol_str = ctab 208 if not mol_str: 209 raise BadMoleculeException('Unexpected blank or NULL molecule') 210 else: 211 mol_str = _fix_line_ends(mol_str) 212 mol_str = _fix_chemdraw_header(mol_str) 213 214 if isSmiles: # branch for NULL_MOL checks 215 if mol_str and NULL_SMILES_RE.match(mol_str): 216 rval = T_NULL_MOL 217 else: 218 rval = pyAvalonTools.CheckMoleculeString(mol_str, isSmiles) 219 else: 220 # decompose the ctab into lines 221 # the line terminator may be \n or \r\n, or even r'\n' 222 ctab_lines = mol_str.split('\n') 223 if len(ctab_lines) <= 3: 224 raise BadMoleculeException('Not enough lines in CTAB') 225 _ctab_remove_chiral_flag(ctab_lines) 226 if not _ctab_has_atoms(ctab_lines): 227 rval = T_NULL_MOL 228 else: # reassemble the ctab lines into one string. 229 mol_str = '\n'.join(ctab_lines) 230 rval = pyAvalonTools.CheckMoleculeString(mol_str, isSmiles) 231 return rval
232 233 234 InchiResult = namedtuple('InchiResult', ['error', 'inchi', 'fixed_ctab']) 235 236
237 -def GetInchiForCTAB(ctab):
238 """ 239 >>> from rdkit.Chem.MolKey import MolKey 240 >>> from rdkit.Avalon import pyAvalonTools 241 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1cn[nH]c1C(Cl)Br',True)) 242 >>> res.inchi 243 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h8H' 244 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1c[nH]nc1C(Cl)Br',True)) 245 >>> res.inchi 246 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h7H' 247 >>> 248 """ 249 inchi = None 250 ctab_str = ctab 251 (strucheck_err, fixed_mol) = CheckCTAB(ctab_str, False) 252 if strucheck_err & BAD_SET: 253 return InchiResult(strucheck_err, None, fixed_mol) 254 255 conversion_err = 0 256 try: 257 r_mol = Chem.MolFromMolBlock(fixed_mol, sanitize=False) 258 if r_mol: 259 inchi = Chem.MolToInchi(r_mol, '/FixedH /SUU') 260 if not inchi: 261 conversion_err = INCHI_COMPUTATION_ERROR 262 else: 263 conversion_err = RDKIT_CONVERSION_ERROR 264 except Chem.InchiReadWriteError: 265 conversion_err = INCHI_READWRITE_ERROR 266 # keep warnings from strucheck 267 return InchiResult(strucheck_err | conversion_err, inchi, fixed_mol)
268 269
270 -def _make_racemate_inchi(inchi):
271 """ Normalize the stereo information (t-layer) to one selected isomer. """ 272 # set stereo type = 3 (racemate) for consistency 273 # reset inverted flag to m0 - not inverted 274 new_stereo = '/m0/s3/' 275 stereo_match = GET_STEREO_RE.match(inchi) 276 if stereo_match: 277 inchi = stereo_match.group(1) + new_stereo + stereo_match.group(2) 278 return inchi
279 280
281 -def _get_identification_string(err, ctab, inchi, stereo_category=None, extra_stereo=None):
282 if err & NULL_MOL: 283 return _get_null_mol_identification_string(extra_stereo) 284 elif err & BAD_SET: # bad molecules get special key 285 return _get_bad_mol_identification_string(ctab, stereo_category, extra_stereo) 286 287 # make key string 288 pieces = [] 289 if inchi: 290 pieces.append(inchi) 291 if not stereo_category: 292 raise MolIdentifierException('Stereo category may not be left undefined') 293 else: 294 pieces.append('ST=' + stereo_category) 295 if extra_stereo: 296 pieces.append('XTR=' + extra_stereo) 297 key_string = '/'.join(pieces) 298 return key_string
299 300
301 -def _get_null_mol_identification_string(extra_stereo):
302 key_string = str(uuid.uuid1()) 303 return key_string
304 305
306 -def _get_bad_mol_identification_string(ctab, stereo_category, extra_stereo):
307 pieces = [] 308 ctab_str = ctab 309 if ctab_str: # make the ctab part of the key if available 310 ctab_str = _fix_line_ends(ctab_str) 311 ctab_str = _fix_chemdraw_header(ctab_str) 312 ctab_str = '\n'.join(ctab_str.split('\n')[3:]) 313 pieces.append(ctab_str.replace('\n', r'\n')) # make a handy one-line string 314 else: 315 pass 316 if stereo_category: # add xtra info if available 317 key_string = 'ST={0}'.format(stereo_category) 318 pieces.append(key_string) 319 if extra_stereo: # add xtra info if available 320 key_string = 'XTR={0}'.format(extra_stereo) 321 pieces.append(key_string) 322 key_string = '/'.join(pieces) 323 return key_string
324 325
326 -def _identify(err, ctab, inchi, stereo_category, extra_structure_desc=None):
327 """ Compute the molecule key based on the inchi string, 328 stereo category as well as extra structure 329 information """ 330 key_string = _get_identification_string(err, ctab, inchi, stereo_category, extra_structure_desc) 331 if key_string: 332 return "{0}|{1}".format( 333 MOL_KEY_VERSION, base64.b64encode(hashlib.md5(key_string.encode('UTF-8')).digest()).decode()) 334 else: 335 return None
336 337
338 -def _get_chiral_identification_string(n_def, n_udf):
339 assert n_def >= 0 340 assert n_udf >= 0 341 id_str = 'OTHER' 342 343 if n_def == 0: # no defined stereocenters 344 if n_udf == 0: # no undefined ones either 345 id_str = 'S_ACHIR' # -> achiral 346 elif n_udf == 1: # one undefined, no defined 347 id_str = 'R_ONE' # -> racemate by convention 348 else: # several undefined, no defined 349 id_str = 'S_UNKN' # -> can't say anything based on the drawing 350 else: # some stereo defined 351 if n_udf == 0: # fully specified stereo 352 id_str = 'S_ABS' # -> absolute stereo 353 else: # multiple possibilities 354 id_str = 'S_PART' # -> assume single compound (can usually be separated) 355 return id_str
356 357
358 -def ErrorBitsToText(err):
359 " returns a list of error bit descriptions for the error code provided " 360 return [k for k, v in ERROR_DICT.items() if (err & v) > 0]
361 362 363 MolKeyResult = namedtuple( 364 'MolKeyResult', ['mol_key', 'error', 'inchi', 'fixed_ctab', 'stereo_code', 'stereo_comment']) 365 366
367 -def GetKeyForCTAB(ctab, stereo_info=None, stereo_comment=None, logger=None):
368 """ 369 >>> from rdkit.Chem.MolKey import MolKey 370 >>> from rdkit.Avalon import pyAvalonTools 371 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1C(F)Cl',True)) 372 >>> res.mol_key 373 '1|L7676nfGsSIU33wkx//NCg==' 374 >>> res.stereo_code 375 'R_ONE' 376 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@H](F)Cl',True)) 377 >>> res.mol_key 378 '1|Aj38EIxf13RuPDQG2A0UMw==' 379 >>> res.stereo_code 380 'S_ABS' 381 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@@H](F)Cl',True)) 382 >>> res.mol_key 383 '1|9ypfMrhxn1w0ncRooN5HXw==' 384 >>> res.stereo_code 385 'S_ABS' 386 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1[C@@H](F)Cl',True)) 387 >>> res.mol_key 388 '1|c96jMSlbn7O9GW5d5uB9Mw==' 389 >>> res.stereo_code 390 'S_PART' 391 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc([C@H](Br)Cl)c1[C@@H](F)Cl',True)) 392 >>> res.mol_key 393 '1|+B+GCEardrJteE8xzYdGLA==' 394 >>> res.stereo_code 395 'S_ABS' 396 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True)) 397 >>> res.mol_key 398 '1|5H9R3LvclagMXHp3Clrc/g==' 399 >>> res.stereo_code 400 'S_UNKN' 401 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True),stereo_info='S_REL') 402 >>> res.mol_key 403 '1|cqKWVsUEY6QNpGCbDaDTYA==' 404 >>> res.stereo_code 405 'S_REL' 406 >>> res.inchi 407 'InChI=1/C8H6BrCl2F/c9-7(10)5-3-1-2-4-6(5)8(11)12/h1-4,7-8H/t7?,8?' 408 409 """ 410 if logger is None: 411 logger = logging 412 try: 413 err, inchi, fixed_mol = GetInchiForCTAB(ctab) 414 except BadMoleculeException: 415 msg = u'Corrupt molecule substituting no-struct: --->\n{0}\n<----'.format(ctab) 416 logger.warn(msg) 417 err = NULL_MOL 418 key = _identify(err, '', '', None, None) 419 return MolKeyResult(key, err, '', '', None, None) 420 421 # read or estimate stereo category and/or extra structure description 422 stereo_category = None 423 extra_structure_desc = stereo_comment 424 if stereo_info: # check stereo_info field for coded stereo category and extra stereo info 425 info_flds = stereo_info.split(' ', 1) 426 code_fld = info_flds[0] 427 if code_fld in stereo_code_dict: 428 stereo_category = code_fld 429 if (not stereo_comment) and len(info_flds) > 1: 430 extra_structure_desc = info_flds[1].strip() 431 else: 432 logger.warn('stereo code {0} not recognized. Using default value for ctab.'.format(code_fld)) 433 434 if not (err & BAD_SET): 435 (n_stereo, n_undef_stereo, is_meso, 436 dummy) = InchiInfo.InchiInfo(inchi).get_sp3_stereo()['main']['non-isotopic'] 437 if stereo_category is None or stereo_category == 'DEFAULT': # compute if not set 438 stereo_category = _get_chiral_identification_string(n_stereo - n_undef_stereo, n_undef_stereo) 439 else: 440 raise NotImplementedError( 441 "currently cannot generate correct keys for molecules with struchk errors") 442 key = _identify(err, fixed_mol, inchi, stereo_category, extra_structure_desc) 443 return MolKeyResult(key, err, inchi, fixed_mol, stereo_category, extra_structure_desc)
444 445 446 # ------------------------------------ 447 # 448 # doctest boilerplate 449 #
450 -def _runDoctests(verbose=None): # pragma: nocover
451 import sys 452 import doctest 453 failed, _ = doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose) 454 sys.exit(failed) 455 456 457 if __name__ == '__main__': # pragma: nocover 458 _runDoctests() 459