Package rdkit :: Package ML :: Package Scoring :: Module Scoring
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Scoring.Scoring

  1  """ 
  2  $Id$ 
  3   
  4  Scoring - Calculate rank statistics 
  5   
  6  Created by Sereina Riniker, October 2012 
  7  after a file from Peter Gedeck, Greg Landrum 
  8   
  9  \param scores: ordered list with descending similarity containing 
 10                 active/inactive information 
 11  \param col: column index in scores where active/inactive information is stored 
 12  \param fractions: list of fractions at which the value shall be calculated 
 13  \param alpha: exponential weight 
 14  """ 
 15   
 16  import math 
 17   
 18   
def CalcROC(scores, col):
    """Compute the points of a ROC curve for a ranked score list.

    scores: ranked sequence of rows; ``row[col]`` is truthy for an active
            and falsy for an inactive.
    col:    index into each row where the active/inactive flag is stored.

    Returns a two-element list ``[x, y]`` of equal-length lists, one point
    per ranked entry:
      * ``x[i]`` - fraction of all inactives found in the top ``i + 1`` rows
      * ``y[i]`` - fraction of all actives found in the top ``i + 1`` rows

    If there are no actives (or no inactives) the corresponding list is left
    as raw counts, i.e. all zeros.

    Raises ValueError if ``scores`` is empty.
    """
    total = len(scores)
    if total == 0:
        raise ValueError('score list is empty')

    # running counts of actives / inactives seen down to each rank
    activeCounts = []
    inactiveCounts = []
    seenActives = 0
    seenInactives = 0
    for entry in scores:
        if entry[col]:
            seenActives += 1
        else:
            seenInactives += 1
        activeCounts.append(seenActives)
        inactiveCounts.append(seenInactives)

    # turn counts into fractions; skip a class that never occurs to avoid
    # dividing by zero
    if seenActives > 0:
        activeCounts = [1.0 * count / seenActives for count in activeCounts]
    if seenInactives > 0:
        inactiveCounts = [1.0 * count / seenInactives for count in inactiveCounts]

    return [inactiveCounts, activeCounts]
45 46
def CalcAUC(scores, col):
    """Return the area under the ROC curve of a ranked score list.

    scores: ranked sequence of rows; ``row[col]`` flags an active.
    col:    index into each row where the active/inactive flag is stored.

    The curve is obtained from CalcROC and integrated with the trapezoid
    rule over consecutive points. CalcROC raises ValueError for an empty
    score list.
    """
    xs, ys = CalcROC(scores, col)

    # accumulate (width * summed heights); the factor 1/2 of the trapezoid
    # rule is applied once at the end
    area = 0
    for x0, x1, y0, y1 in zip(xs, xs[1:], ys, ys[1:]):
        area += (x1 - x0) * (y1 + y0)

    return 0.5 * area
62 63
def _RIEHelper(scores, col, alpha):
    """Compute the robust initial enhancement (RIE) of a ranked score list.

    scores: ranked sequence of rows; ``row[col]`` is truthy for an active.
    col:    index into each row where the active/inactive flag is stored.
    alpha:  exponential weight; converted with ``float()`` and must be > 0.

    Returns a tuple ``(RIE, numActives)``. RIE is 0.0 when the list contains
    no actives.

    Raises ValueError if ``scores`` is empty or ``alpha`` is not positive.
    ``alpha`` is converted before the emptiness check, so a value that
    ``float()`` rejects raises first.
    """
    total = len(scores)
    alpha = float(alpha)
    if total == 0:
        raise ValueError('score list is empty')
    if alpha <= 0.0:
        raise ValueError('alpha must be greater than zero')

    # normalisation term of the RIE expression
    norm = 1.0 / total * ((1 - math.exp(-alpha)) / (math.exp(alpha / total) - 1))

    # sum of exponentially weighted 1-based ranks of the actives
    activeCount = 0
    weighted = 0
    for rank, entry in enumerate(scores, 1):
        if entry[col]:
            activeCount += 1
            weighted += math.exp(-(alpha * rank) / total)

    if activeCount > 0:
        rie = weighted / (activeCount * norm)
    else:
        rie = 0.0  # no actives at all

    return rie, activeCount
89 90
def CalcRIE(scores, col, alpha):
    """Return the robust initial enhancement (RIE) of a ranked score list.

    RIE was originally defined in:
    Sheridan, R.P., Singh, S.B., Fluder, E.M. & Kearsley, S.K.
    Protocols for Bridging the Peptide to Nonpeptide Gap in Topological
    Similarity Searches. J. Chem. Inf. Comp. Sci. 41, 1395-1406 (2001).

    scores: ranked sequence of rows; ``row[col]`` flags an active.
    col:    index into each row where the active/inactive flag is stored.
    alpha:  exponential weight.

    Delegates to _RIEHelper and drops the active count it also returns.
    """
    return _RIEHelper(scores, col, alpha)[0]
99 100
def CalcBEDROC(scores, col, alpha):
    """Return the BEDROC value of a ranked score list.

    BEDROC was originally defined in:
    Truchon, J. & Bayly, C.I.
    Evaluating Virtual Screening Methods: Good and Bad Metric for the
    "Early Recognition" Problem. J. Chem. Inf. Model. 47, 488-508 (2007).

    scores: ranked sequence of rows; ``row[col]`` flags an active.
    col:    index into each row where the active/inactive flag is stored.
    alpha:  exponential weight; anything ``float()`` accepts, must be > 0.

    Returns the RIE rescaled between its minimum and maximum for the given
    active ratio; 0.0 when there are no actives and 1.0 when the minimum
    and maximum coincide (every entry is active).

    Input validation is done by _RIEHelper, which raises ValueError for an
    empty score list or a non-positive alpha.
    """
    # calculate RIE (this also validates scores and alpha)
    RIE, numActives = _RIEHelper(scores, col, alpha)

    if numActives == 0:
        return 0.0

    # _RIEHelper converts only its own local copy of alpha; convert here too
    # so every alpha accepted by CalcRIE is also accepted by CalcBEDROC
    alpha = float(alpha)

    ratio = 1.0 * numActives / len(scores)
    RIEmax = (1 - math.exp(-alpha * ratio)) / (ratio * (1 - math.exp(-alpha)))
    RIEmin = (1 - math.exp(alpha * ratio)) / (ratio * (1 - math.exp(alpha)))

    if RIEmax == RIEmin:  # numActives == numMol
        return 1.0
    return (RIE - RIEmin) / (RIEmax - RIEmin)
124 125
def CalcEnrichment(scores, col, fractions):
    """Determine the enrichment factor at each of a set of fractions.

    scores:    ranked sequence of rows; ``row[col]`` is truthy for an active.
    col:       index into each row where the active/inactive flag is stored.
    fractions: fractions in [0, 1] of the ranked list at which the
               enrichment factor is evaluated.

    For a fraction ``f`` the top ``n = max(1, ceil(len(scores) * f))`` rows
    are considered, and the enrichment factor is
    ``(actives in top n / n) / (all actives / len(scores))``.

    Returns a list with exactly one value per entry of ``fractions``, in the
    same order. Fractions may be unsorted, repeated, or equal to 1.0 (which
    yields 1.0). All values are 0.0 when the list contains no actives.

    Raises ValueError if ``scores`` or ``fractions`` is empty, or if any
    fraction lies outside [0, 1].
    """
    numMol = len(scores)
    if numMol == 0:
        raise ValueError('score list is empty')
    if len(fractions) == 0:
        raise ValueError('fraction list is empty')
    for frac in fractions:
        if frac > 1 or frac < 0:
            raise ValueError('fractions must be between [0,1]')

    # activesInTop[k] = number of actives among the k top-ranked rows
    activesInTop = [0] * (numMol + 1)
    for rank in range(numMol):
        activesInTop[rank + 1] = activesInTop[rank] + (1 if scores[rank][col] else 0)

    numActives = activesInTop[numMol]
    if numActives == 0:  # no actives: enrichment is undefined, report zeros
        return [0.0] * len(fractions)

    enrich = []
    for frac in fractions:
        # evaluate every fraction independently so the result does not depend
        # on ordering or on two fractions sharing a cutoff; always look at
        # one row at least
        cutoff = max(1, int(math.ceil(numMol * frac)))
        enrich.append(1.0 * activesInTop[cutoff] * numMol / cutoff / numActives)
    return enrich
156 # 157 # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. 158 # All rights reserved. 159 # 160 # Redistribution and use in source and binary forms, with or without 161 # modification, are permitted provided that the following conditions are 162 # met: 163 # 164 # * Redistributions of source code must retain the above copyright 165 # notice, this list of conditions and the following disclaimer. 166 # * Redistributions in binary form must reproduce the above 167 # copyright notice, this list of conditions and the following 168 # disclaimer in the documentation and/or other materials provided 169 # with the distribution. 170 # * Neither the name of Novartis Institutes for BioMedical Research Inc. 171 # nor the names of its contributors may be used to endorse or promote 172 # products derived from this software without specific prior written permission. 173 # 174 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 175 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 176 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 177 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 178 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 179 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 180 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 181 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 182 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 183 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 184 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 185 # 186