Package rdkit :: Package Chem :: Package Fingerprints :: Module ClusterMols
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.Fingerprints.ClusterMols

  1  # 
  2  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  3  # 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  """ utility functionality for clustering molecules using fingerprints 
 11   includes a command line app for clustering 
 12   
 13   
 14  Sample Usage: 
 15    python ClusterMols.py  -d data.gdb -t daylight_sig \ 
 16      --idName="CAS_TF" -o clust1.pkl \ 
 17      --actTable="dop_test" --actName="moa_quant" 
 18   
 19  """ 
 20  from __future__ import print_function 
 21   
 22  import numpy 
 23   
 24  from rdkit import DataStructs 
 25  from rdkit.Chem.Fingerprints import FingerprintMols, MolSimilarity 
 26  from rdkit.ML.Cluster import Murtagh 
 27  from rdkit.six.moves import cPickle 
 28   
 29  message = FingerprintMols.message 
 30  error = FingerprintMols.error 
 31   
 32   
def GetDistanceMatrix(data, metric, isSimilarity=1):
    """ data should be a list of tuples with fingerprints in position 1
    (the rest of the elements of the tuple are not important)

      Arguments:

        - data: sequence of tuples; element 1 of each tuple is a fingerprint
          object providing GetNumBits()

        - metric: callable taking two fingerprints and returning a number

        - isSimilarity: if true, each metric value v is stored as 1 - v
          (i.e. converted from a similarity to a distance); otherwise the
          metric value is stored unchanged

      Returns the symmetric distance matrix
      (see ML.Cluster.Resemblance for layout documentation)

      The result is a flat float array with nPts*(nPts-1)/2 entries, filled
      pair by pair: for each col in 1..nPts-1, rows 0..col-1.

    """
    nPts = len(data)
    # Floor division keeps the pair count an int: it is used as an array
    # shape, and true division would produce a float on Python 3.
    res = numpy.zeros(nPts * (nPts - 1) // 2, float)
    nSoFar = 0
    for col in range(1, nPts):
        for row in range(col):
            fp1 = data[col][1]
            fp2 = data[row][1]
            nBits1 = fp1.GetNumBits()
            nBits2 = fp2.GetNumBits()
            # Fold the longer fingerprint down so both have comparable
            # lengths; the fold factor has to be an integer.
            if nBits1 > nBits2:
                fp1 = DataStructs.FoldFingerprint(fp1, nBits1 // nBits2)
            elif nBits2 > nBits1:
                fp2 = DataStructs.FoldFingerprint(fp2, nBits2 // nBits1)
            sim = metric(fp1, fp2)
            if isSimilarity:
                sim = 1. - sim
            res[nSoFar] = sim
            nSoFar += 1
    return res
58 59
def ClusterPoints(data, metric, algorithmId, haveLabels=False, haveActs=True,
                  returnDistances=False):
    """ Clusters the points in data and returns the annotated cluster tree

      Arguments:

        - data: sequence of tuples: label/id in position 0, fingerprint in
          position 1 and, optionally, an activity value in position 2

        - metric: passed through to GetDistanceMatrix(), which is called
          with its default isSimilarity setting

        - algorithmId: passed through to Murtagh.ClusterData()

        - haveLabels: if true, element 0 of each tuple is used directly as
          the point label; otherwise the label is 'Mol: <element 0>'

        - haveActs: if true and the first tuple has more than two elements,
          element 2 of every tuple is converted to int and attached to the
          tree and to its points

        - returnDistances: if true, the distance matrix is returned too

      Returns the cluster tree, or a 2-tuple (tree, distance matrix) when
      returnDistances is set.

    """
    message('Generating distance matrix.\n')
    distances = GetDistanceMatrix(data, metric)
    message('Clustering\n')
    tree = Murtagh.ClusterData(distances, len(data), algorithmId, isDistData=1)[0]

    activities = []
    if haveActs and len(data[0]) > 2:
        # activity values are available, so carry them along
        activities = [int(entry[2]) for entry in data]

    if haveLabels:
        names = [entry[0] for entry in data]
    else:
        names = ['Mol: %s' % str(entry[0]) for entry in data]

    tree._ptLabels = names
    if activities:
        tree._ptValues = activities

    for point in tree.GetPoints():
        # point indices are shifted down by one to index the input lists
        pos = point.GetIndex() - 1
        point.SetName(names[pos])
        if not activities:
            continue
        try:
            point.SetData(int(activities[pos]))
        except Exception:
            # best effort: a point whose activity cannot be set is left as-is
            pass

    if returnDistances:
        return tree, distances
    return tree
90 91
def ClusterFromDetails(details):
    """ Returns the cluster tree

      Fingerprints are fetched with MolSimilarity.GetFingerprints(details)
      and truncated to details.maxMols entries when that value is positive.
      If details.outFileName is set, the cluster tree is also pickled to
      that file.

      Returns None if the output file cannot be opened (an error message is
      emitted) or if there are no fingerprints to cluster.

    """
    data = MolSimilarity.GetFingerprints(details)
    if details.maxMols > 0:
        data = data[:details.maxMols]

    # The output file is opened before clustering so that an unwritable
    # destination is reported before any clustering work is done.
    outF = None
    if details.outFileName:
        try:
            outF = open(details.outFileName, 'wb+')
        except IOError:
            error("Error: could not open output file %s for writing\n" % (details.outFileName))
            return None

    try:
        if not data:
            return None

        clustTree = ClusterPoints(data, details.metric, details.clusterAlgo, haveLabels=0,
                                  haveActs=1)
        if outF:
            cPickle.dump(clustTree, outF)
        return clustTree
    finally:
        # Always release the handle (and flush the pickle), including on the
        # early return above and when clustering raises.
        if outF is not None:
            outF.close()
# Command-line help text for this module.  It is installed as
# FingerprintMols._usageDoc by the script entry point below.
# NOTE(review): leading indentation of the lines inside this string could not
# be recovered from the listing this was taken from -- confirm the exact
# whitespace against the upstream file.
_usageDoc = """
Usage: ClusterMols.py [args] <fName>

If <fName> is provided and no tableName is specified (see below),
data will be read from the text file <fName>. Text files delimited
with either commas (extension .csv) or tabs (extension .txt) are
supported.

Command line arguments are:

- -d _dbName_: set the name of the database from which
to pull input fingerprint information.

- -t _tableName_: set the name of the database table
from which to pull input fingerprint information

- --idName=val: sets the name of the id column in the input
database. Default is *ID*.

- -o _outFileName_: name of the output file (output will
be a pickle (.pkl) file with the cluster tree)

- --actTable=val: name of table containing activity values
(used to color points in the cluster tree).

- --actName=val: name of column with activities in the activity
table. The values in this column should either be integers or
convertible into integers.

- --SLINK: use the single-linkage clustering algorithm
(default is Ward's minimum variance)

- --CLINK: use the complete-linkage clustering algorithm
(default is Ward's minimum variance)

- --UPGMA: use the group-average clustering algorithm
(default is Ward's minimum variance)

- --dice: use the DICE similarity metric instead of Tanimoto

- --cosine: use the cosine similarity metric instead of Tanimoto

- --fpColName=val: name to use for the column which stores
fingerprints (in pickled format) in the input db table.
Default is *AutoFragmentFP*

- --minPath=val: minimum path length to be included in
fragment-based fingerprints. Default is *2*.

- --maxPath=val: maximum path length to be included in
fragment-based fingerprints. Default is *7*.

- --nBitsPerHash: number of bits to be set in the output
fingerprint for each fragment. Default is *4*.

- --discrim: use of path-based discriminators to hash bits.
Default is *false*.

- -V: include valence information in the fingerprints
Default is *false*.

- -H: include Hs in the fingerprint
Default is *false*.

- --useMACCS: use the public MACCS keys to do the fingerprinting
(instead of a daylight-type fingerprint)


"""
if __name__ == '__main__':
    # Script entry point: announce the tool, parse the command line and
    # cluster according to the parsed details.
    message("This is ClusterMols\n\n")
    # Install this module's help text before parsing; presumably ParseArgs
    # reports usage from FingerprintMols._usageDoc -- TODO confirm.
    FingerprintMols._usageDoc = _usageDoc
    details = FingerprintMols.ParseArgs()
    ClusterFromDetails(details)