1
2
3
4
5
6
7
8
9
10
11 """ Interface to the C++ Murtagh hierarchic clustering code
12
13 """
14 import numpy
15
16 from rdkit.ML.Cluster import Clusters
17 from rdkit.ML.Cluster.Clustering import MurtaghCluster, MurtaghDistCluster
18
19
20 WARDS = 1
21 SLINK = 2
22 CLINK = 3
23 UPGMA = 4
24 MCQUITTY = 5
25 GOWER = 6
26 CENTROID = 7
27
28
29 methods = [
30 ("Ward's Minimum Variance", WARDS, "Ward's Minimum Variance"),
31 ('Average Linkage', UPGMA, 'Group Average Linkage (UPGMA)'),
32 ('Single Linkage', SLINK, 'Single Linkage (SLINK)'),
33 ('Complete Linkage', CLINK, 'Complete Linkage (CLINK)'),
34
35
36 ("Centroid", CENTROID, "Centroid method"),
37 ]
38
39
41 """ *Internal Use Only*
42
43 returns the distance between points i and j in the symmetric
44 distance matrix _dists_
45
46 """
47 if i == j:
48 return 0.0
49 if i > j:
50 i, j = j, i
51 return dists[j * (j - 1) / 2 + i]
52
53
54 -def _ToClusters(data, nPts, ia, ib, crit, isDistData=0):
55 """ *Internal Use Only*
56
57 Converts the results of the Murtagh clustering code into
58 a cluster tree, which is returned in a single-entry list
59
60 """
61 cs = [None] * nPts
62 for i in range(nPts):
63 cs[i] = Clusters.Cluster(metric=0.0, data=i, index=(i + 1))
64
65 nClus = len(ia) - 1
66 for i in range(nClus):
67 idx1 = ia[i] - 1
68 idx2 = ib[i] - 1
69 c1 = cs[idx1]
70 c2 = cs[idx2]
71 newClust = Clusters.Cluster(metric=crit[i], children=[c1, c2], index=nPts + i + 1)
72 cs[idx1] = newClust
73
74 return [newClust]
75
76
78 """ clusters the data points passed in and returns the cluster tree
79
80 **Arguments**
81
82 - data: a list of lists (or array, or whatever) with the input
83 data (see discussion of _isDistData_ argument for the exception)
84
85 - nPts: the number of points to be used
86
87 - method: determines which clustering algorithm should be used.
88 The defined constants for these are:
89 'WARDS, SLINK, CLINK, UPGMA'
90
91 - isDistData: set this toggle when the data passed in is a
92 distance matrix. The distance matrix should be stored
93 symmetrically so that _LookupDist (above) can retrieve
94 the results:
95 for i<j: d_ij = dists[j*(j-1)/2 + i]
96
97
98 **Returns**
99
100 - a single entry list with the cluster tree
101 """
102 data = numpy.array(data)
103 if not isDistData:
104 sz = data.shape[1]
105 ia, ib, crit = MurtaghCluster(data, nPts, sz, method)
106 else:
107 ia, ib, crit = MurtaghDistCluster(data, nPts, method)
108 c = _ToClusters(data, nPts, ia, ib, crit, isDistData=isDistData)
109
110 return c
111