Package rdkit :: Package ML :: Package Cluster :: Module ClusterUtils
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Cluster.ClusterUtils

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2001-2008  greg Landrum 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """utility functions for clustering 
 12   
 13  """ 
 14   
 15   
16 -def GetNodeList(cluster):
17 """returns an ordered list of all nodes below cluster 18 19 the ordering is done using the lengths of the child nodes 20 21 **Arguments** 22 23 - cluster: the cluster in question 24 25 **Returns** 26 27 - a list of the leaves below this cluster 28 29 """ 30 if len(cluster) == 1: 31 return [cluster] 32 else: 33 children = cluster.GetChildren() 34 children.sort(key=lambda x: len(x), reverse=True) 35 res = [] 36 for child in children: 37 res += GetNodeList(child) 38 res += [cluster] 39 return res
40 41
42 -def GetNodesDownToCentroids(cluster, above=1):
43 """returns an ordered list of all nodes below cluster 44 45 46 """ 47 if hasattr(cluster, '_isCentroid'): 48 cluster._aboveCentroid = 0 49 above = -1 50 else: 51 cluster._aboveCentroid = above 52 if len(cluster) == 1: 53 return [cluster] 54 else: 55 res = [] 56 children = cluster.GetChildren() 57 children.sort(key=lambda x: len(x), reverse=True) 58 # children.sort(lambda x, y: cmp(len(y), len(x))) 59 for child in children: 60 res = res + GetNodesDownToCentroids(child, above) 61 res = res + [cluster] 62 return res
63 64
65 -def FindClusterCentroidFromDists(cluster, dists):
66 """ find the point in a cluster which has the smallest summed 67 Euclidean distance to all others 68 69 **Arguments** 70 71 - cluster: the cluster to work with 72 73 - dists: the distance matrix to use for the points 74 75 **Returns** 76 77 - the index of the centroid point 78 79 """ 80 children = cluster.GetPoints() 81 pts = [x.GetData() for x in children] 82 83 best = 1e24 84 bestIdx = -1 85 for pt in pts: 86 dAccum = 0.0 87 # loop over others and add'em up 88 for other in pts: 89 if other != pt: 90 if other > pt: 91 row, col = pt, other 92 else: 93 row, col = other, pt 94 dAccum += dists[col * (col - 1) / 2 + row] 95 if dAccum >= best: 96 # minor efficiency hack 97 break 98 if dAccum < best: 99 best = dAccum 100 bestIdx = pt 101 for i in range(len(pts)): 102 pt = pts[i] 103 if pt != bestIdx: 104 if pt > bestIdx: 105 row, col = bestIdx, pt 106 else: 107 row, col = pt, bestIdx 108 children[i]._distToCenter = dists[col * (col - 1) / 2 + row] 109 else: 110 children[i]._distToCenter = 0.0 111 children[i]._clustCenter = bestIdx 112 cluster._clustCenter = bestIdx 113 cluster._distToCenter = 0.0 114 115 return bestIdx
116 117
118 -def _BreadthFirstSplit(cluster, n):
119 """ *Internal Use Only* 120 121 """ 122 if len(cluster) < n: 123 raise ValueError('Cannot split cluster of length %d into %d pieces' % (len(cluster), n)) 124 if len(cluster) == n: 125 return cluster.GetPoints() 126 clusters = [cluster] 127 nxtIdx = 0 128 for _ in range(n - 1): 129 while nxtIdx < len(clusters) and len(clusters[nxtIdx]) == 1: 130 nxtIdx += 1 131 assert nxtIdx < len(clusters) 132 133 children = clusters[nxtIdx].GetChildren() 134 children.sort(key=lambda x: x.GetMetric(), reverse=True) 135 for child in children: 136 clusters.append(child) 137 del clusters[nxtIdx] 138 return clusters
139 140
141 -def _HeightFirstSplit(cluster, n):
142 """ *Internal Use Only* 143 144 """ 145 if len(cluster) < n: 146 raise ValueError('Cannot split cluster of length %d into %d pieces' % (len(cluster), n)) 147 if len(cluster) == n: 148 return cluster.GetPoints() 149 clusters = [cluster] 150 for _ in range(n - 1): 151 nxtIdx = 0 152 while nxtIdx < len(clusters) and len(clusters[nxtIdx]) == 1: 153 nxtIdx += 1 154 assert nxtIdx < len(clusters) 155 156 children = clusters[nxtIdx].GetChildren() 157 for child in children: 158 clusters.append(child) 159 del clusters[nxtIdx] 160 clusters.sort(key=lambda x: x.GetMetric(), reverse=True) 161 return clusters
162 163
164 -def SplitIntoNClusters(cluster, n, breadthFirst=True):
165 """ splits a cluster tree into a set of branches 166 167 **Arguments** 168 169 - cluster: the root of the cluster tree 170 171 - n: the number of clusters to include in the split 172 173 - breadthFirst: toggles breadth first (vs depth first) cleavage 174 of the cluster tree. 175 176 **Returns** 177 178 - a list of sub clusters 179 180 """ 181 if breadthFirst: 182 return _BreadthFirstSplit(cluster, n) 183 else: 184 return _HeightFirstSplit(cluster, n)
185