RDKit
Open-source cheminformatics and machine learning.
MHFP.h
Go to the documentation of this file.
1 //
2 // 2019, Daniel Probst, Reymond Group @ University of Bern
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 /*! \file MHFP.h
12 
13 */
14 #include <RDGeneral/export.h>
15 #ifndef RD_MHFPFPS_H
16 #define RD_MHFPFPS_H
17 #include <string>
18 #include <vector>
19 #include <GraphMol/ROMol.h>
21 
22 class SparseBitVect;
23 namespace RDKit {
24 namespace MHFPFingerprints {
25 const std::string mhfpFingerprintVersion = "1.0.0";
26 
namespace FNV {
// Parameters of the 32-bit hash below: `prime` is the per-byte multiplier and
// `seed` is the default initial hash state.
constexpr uint32_t prime = 0x01000193;
constexpr uint32_t seed = 0x811C9DC5;

//! A simple implementation of the Fowler–Noll–Vo hash function.
/*!
  Every byte of \c str is XORed into the running hash, which is then
  multiplied by FNV::prime using 32-bit wrap-around arithmetic (the
  XOR-then-multiply order of the FNV-1a variant).

  \param str  the bytes to hash. All <tt>str.length()</tt> bytes are consumed,
              including embedded NUL characters.
  \param hash the initial hash state. Default: FNV::seed. Passing the result
              of a previous call continues that hash over more data.

  \returns the 32-bit hash; equal to \c hash when \c str is empty.
*/
inline uint32_t hash(const std::string& str, uint32_t hash = seed) {
  for (const char ch : str) {
    // Convert through unsigned char so bytes >= 0x80 are not sign-extended
    // before being mixed into the hash.
    hash = (static_cast<unsigned char>(ch) ^ hash) * prime;
  }
  return hash;
}
}  // namespace FNV
43 
45 public:
46  //! Constructor
47  /*!
48  \brief Construct a MHFPEncoder
49 
50  The MHFPEncoder class is instantiated with a given number of permutations
51  and a seed. Fingerprints / minhashes created with a different number of
52  permutations or a different seed are not compatible.
53 
54  \param n_permutations the number of permutations used to create hash functions.
55  This will be the dimensionality of the resulting vector.
56  Default: <tt>2048</tt>.
57  \param seed a random seed. Default: <tt>42</tt>.
58  */
59  MHFPEncoder(unsigned int n_permutations = 2048,
60  unsigned int seed = 42);
61 
62  /*!
63  \brief Creates a MinHash from a vector of strings.
64 
65  This method is exposed in order to enable advanced usage of this MHFP
66  implementation such as customizing the properties that are hashed in order
67  to create an MHFP instance. In theory, any number of values that can be
68  represented as strings can be minhashed. This method is called
69  by MHFPEncoder::Encode.
70 
71  \param vec a vector containing strings (e.g. the SMILES shingling of a molecule).
72 
73  \returns the MinHash of the input.
74  */
75  std::vector<uint32_t>
76  FromStringArray(const std::vector<std::string>& vec);
77 
78  /*!
79  \brief Creates a MinHash from a list of unsigned integers.
80 
81  This method is exposed in order to enable advanced usage of this MHFP
82  implementation such as MinHashing a sparse array generated by another
83  fingerprint (e.g. Morgan / ECFP).
84 
85  \param vec a vector containing unsigned integers.
86 
87  \returns the MinHash of the input.
88  */
89  std::vector<uint32_t>
90  FromArray(const std::vector<uint32_t>& vec);
91 
92  /*!
93  \brief Creates a molecular shingling based on circular substructures.
94 
95  A molecular shingling is a vector of SMILES that were extracted from and
96  represent a molecule. This method extracts substructures centered at each
97  atom of the molecule with different radii. A molecule with 10 atoms will
98  generate <tt>10 * 3</tt> shingles when a radius of <tt>3</tt> is chosen.
99 
100  \param radius the maximum radius of the substructure that is generated at
101  each atom. Default: <tt>3</tt>.
102  \param rings whether the rings (SSSR) are extracted from the molecule and
103  added to the shingling. Given the molecule
104  <tt>"C1CCCCCC1C(=O)C"</tt>, <tt>"C1CCCCCC1"</tt> would be added
105  to the shingling. Default: <tt>true</tt>.
106  \param isomeric whether the SMILES added to the shingling are isomeric.
107  Default: <tt>false</tt>.
108  \param kekulize whether the SMILES added to the shingling are kekulized.
109  Default: <tt>true</tt>.
110  \param min_radius the minimum radius that is used to extract n-grams.
111  Default: <tt>1</tt>.
112 
113  \returns the shingling of a molecule.
114  */
115  std::vector<std::string>
116  CreateShingling(const ROMol& mol,
117  unsigned char radius = 3,
118  bool rings = true,
119  bool isomeric = false,
120  bool kekulize = true,
121  unsigned char min_radius = 1);
122 
123  //! \overload
124  std::vector<std::string>
125  CreateShingling(const std::string& smiles,
126  unsigned char radius = 3,
127  bool rings = true,
128  bool isomeric = false,
129  bool kekulize = true,
130  unsigned char min_radius = 1);
131 
132  /*!
133  \brief Creates a MinHash vector from a molecule.
134 
135  This method is a wrapper around MHFPEncoder::CreateShingling and
136  MHFPEncoder::FromStringArray. When a vector of molecules or SMILES is passed
137  and RDKit was compiled with OpenMP, it is parallelized and will speed up by
138  a factor of the number of cores.
139 
140  \param radius the maximum radius of the substructure that is generated at
141  each atom. Default: <tt>3</tt>.
142  \param rings whether the rings (SSSR) are extracted from the molecule and
143  added to the shingling. Given the molecule
144  <tt>"C1CCCCCC1C(=O)C"</tt>, <tt>"C1CCCCCC1"</tt> would be added
145  to the shingling. Default: <tt>true</tt>.
146  \param isomeric whether the SMILES added to the shingling are isomeric.
147  Default: <tt>false</tt>.
148  \param kekulize whether the SMILES added to the shingling are kekulized.
149  Default: <tt>true</tt>.
150  \param min_radius the minimum radius that is used to extract n-grams.
151  Default: <tt>1</tt>.
152 
153  \returns the MHFP fingerprint.
154  */
155  std::vector<uint32_t>
156  Encode(ROMol& mol,
157  unsigned char radius = 3,
158  bool rings = true,
159  bool isomeric = false,
160  bool kekulize = true,
161  unsigned char min_radius = 1);
162 
163  //! \overload
164  std::vector<std::vector<uint32_t>>
165  Encode(std::vector<ROMol>& mols,
166  unsigned char radius = 3,
167  bool rings = true,
168  bool isomeric = false,
169  bool kekulize = true,
170  unsigned char min_radius = 1);
171 
172  //! \overload
173  std::vector<uint32_t>
174  Encode(std::string& smiles,
175  unsigned char radius = 3,
176  bool rings = true,
177  bool isomeric = false,
178  bool kekulize = true,
179  unsigned char min_radius = 1);
180 
181  //! \overload
182  std::vector<std::vector<uint32_t>>
183  Encode(std::vector<std::string>& smiles,
184  unsigned char radius = 3,
185  bool rings = true,
186  bool isomeric = false,
187  bool kekulize = true,
188  unsigned char min_radius = 1);
189 
190 
191  /*!
192  \brief Creates a binary fingerprint based on circular sub-SMILES.
193 
194  Creates a binary fingerprint similar to ECFP. However, instead of using
195  a Morgan-style hashing, circular n-grams (sub-SMILES) are created, hashed
196  directly and folded.
197 
198  \param radius the maximum radius of the substructure that is generated at
199  each atom. Default: <tt>3</tt>.
200  \param rings whether the rings (SSSR) are extracted from the molecule and
201  added to the shingling. Given the molecule
202  <tt>"C1CCCCCC1C(=O)C"</tt>, <tt>"C1CCCCCC1"</tt> would be added
203  to the shingling. Default: <tt>true</tt>.
204  \param isomeric whether the SMILES added to the shingling are isomeric.
205  Default: <tt>false</tt>.
206  \param kekulize whether the SMILES added to the shingling are kekulized.
207  Default: <tt>true</tt>.
208  \param min_radius the minimum radius that is used to extract n-grams.
209  Default: <tt>1</tt>.
210  \param length the length into which the fingerprint is folded.
211  Default: <tt>2048</tt>.
212 
213  \returns the SECFP fingerprint.
214  */
217  unsigned char radius = 3,
218  bool rings = true,
219  bool isomeric = false,
220  bool kekulize = true,
221  unsigned char min_radius = 1,
222  size_t length = 2048);
223 
224  //! \overload
225  std::vector<ExplicitBitVect>
226  EncodeSECFP(std::vector<ROMol>& mols,
227  unsigned char radius = 3,
228  bool rings = true,
229  bool isomeric = false,
230  bool kekulize = true,
231  unsigned char min_radius = 1,
232  size_t length = 2048);
233 
234  //! \overload
236  EncodeSECFP(std::string& smiles,
237  unsigned char radius = 3,
238  bool rings = true,
239  bool isomeric = false,
240  bool kekulize = true,
241  unsigned char min_radius = 1,
242  size_t length = 2048);
243 
244  //! \overload
245  std::vector<ExplicitBitVect>
246  EncodeSECFP(std::vector<std::string>& smiles,
247  unsigned char radius = 3,
248  bool rings = true,
249  bool isomeric = false,
250  bool kekulize = true,
251  unsigned char min_radius = 1,
252  size_t length = 2048);
253 
254  /*!
255  \brief Calculates the Jaccard / Tanimoto distance between two MHFP fingerprints.
256 
257  \param a an MHFP fingerprint vector.
258  \param b an MHFP fingerprint vector.
259 
260  \returns the Jaccard / Tanimoto distance between the two fingerprints.
261  */
static double
Distance(const std::vector<uint32_t>& a,
         const std::vector<uint32_t>& b) {
  // Counts the positions at which `a` and `b` hold the same value and returns
  // that count divided by a.size(). Identical fingerprints therefore yield
  // 1.0, and fingerprints that differ at every position yield 0.0.

  // An empty `a` has nothing to compare; return 0.0 rather than computing
  // 0 / 0.0 (NaN).
  if (a.empty()) {
    return 0.0;
  }

  // Only index positions present in both vectors, so a shorter `b` is never
  // read out of bounds. Positions of `a` that have no partner in `b` simply
  // count as non-matching, because the divisor below stays a.size().
  const size_t overlap = a.size() < b.size() ? a.size() : b.size();

  size_t matches = 0;
  for (size_t i = 0; i < overlap; ++i) {
    if (a[i] == b[i]) {
      ++matches;
    }
  }

  return matches / static_cast<double>(a.size());
}
273 
274 private:
275  //! The fastest mod implementation.
uint64_t
FastMod(const uint64_t input, const uint64_t ceil) {
  // Values already below `ceil` are their own remainder, so the division is
  // only performed when it can actually change the value.
  if (input < ceil) {
    return input;
  }
  return input % ceil;
}
280 
282  Fold(const std::vector<uint32_t>& vec, uint32_t length = 2048) {
283  ExplicitBitVect ebv(length);
284  for (size_t i = 0; i < vec.size(); i++)
285  ebv.setBit(vec[i] % length);
286  return ebv;
287  }
288 
289  std::vector<uint32_t>
290  HashShingling(std::vector<std::string> vec) {
291  std::vector<uint32_t> result(vec.size());
292  for (size_t i = 0; i < vec.size(); i++)
293  result[i] = FNV::hash(vec[i]);
294  return result;
295  }
296 
297  unsigned int n_permutations_, seed_;
298  uint64_t prime_ = 2305843009213693951UL;
299  uint32_t max_hash_ = 4294967295;
300  std::vector<uint32_t> perms_a_;
301  std::vector<uint32_t> perms_b_;
302 };
303 
304 } // namespace MHFPFingerprints
305 } // namespace RDKit
306 
307 #endif
RDKit::MHFPFingerprints::MHFPEncoder::EncodeSECFP
ExplicitBitVect EncodeSECFP(std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
RDKit::MHFPFingerprints::MHFPEncoder::CreateShingling
std::vector< std::string > CreateShingling(const ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1)
Creates a molecular shingling based on circular substructures.
ROMol.h
Defines the primary molecule class ROMol as well as associated typedefs.
RDKit::MHFPFingerprints::FNV::prime
const uint32_t prime
Definition: MHFP.h:28
RDKit::MHFPFingerprints::MHFPEncoder::Encode
std::vector< std::vector< uint32_t > > Encode(std::vector< std::string > &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
RDKit::MHFPFingerprints::MHFPEncoder::Distance
static double Distance(const std::vector< uint32_t > &a, const std::vector< uint32_t > &b)
Calculates the Jaccard / Tanimoto distance between two MHFP fingerprints.
Definition: MHFP.h:263
RDKit::MHFPFingerprints::MHFPEncoder::Encode
std::vector< uint32_t > Encode(std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
RDKit::MHFPFingerprints::MHFPEncoder::EncodeSECFP
std::vector< ExplicitBitVect > EncodeSECFP(std::vector< std::string > &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
ExplicitBitVect.h
RDKit::MHFPFingerprints::MHFPEncoder::MHFPEncoder
MHFPEncoder(unsigned int n_permutations=2048, unsigned int seed=42)
Constructor.
RDKit::MHFPFingerprints::FNV::seed
const uint32_t seed
Definition: MHFP.h:29
RDKit::MHFPFingerprints::MHFPEncoder::FromStringArray
std::vector< uint32_t > FromStringArray(const std::vector< std::string > &vec)
Creates a MinHash from a vector of strings.
RDKit::ROMol
Definition: ROMol.h:171
RDKit::MHFPFingerprints::MHFPEncoder::Encode
std::vector< uint32_t > Encode(ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1)
Creates a MinHash vector from a molecule.
RDKit::MHFPFingerprints::MHFPEncoder::Encode
std::vector< std::vector< uint32_t > > Encode(std::vector< ROMol > &mols, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
RDKit::MHFPFingerprints::MHFPEncoder::EncodeSECFP
ExplicitBitVect EncodeSECFP(ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1, size_t length=2048)
Creates a binary fingerprint based on circular sub-SMILES.
RDKit
Std stuff.
Definition: Atom.h:30
SparseBitVect
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
RDKit::MHFPFingerprints::FNV::hash
uint32_t hash(const std::string &str, uint32_t hash=seed)
A simple implementation of the Fowler–Noll–Vo hash function.
Definition: MHFP.h:33
RDKit::MHFPFingerprints::MHFPEncoder
Definition: MHFP.h:44
RDKIT_FINGERPRINTS_EXPORT
#define RDKIT_FINGERPRINTS_EXPORT
Definition: export.h:242
RDKit::MHFPFingerprints::MHFPEncoder::CreateShingling
std::vector< std::string > CreateShingling(const std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
RDKit::MHFPFingerprints::MHFPEncoder::FromArray
std::vector< uint32_t > FromArray(const std::vector< uint32_t > &vec)
Creates a MinHash from a list of unsigned integers.
RDKit::MHFPFingerprints::mhfpFingerprintVersion
const std::string mhfpFingerprintVersion
Definition: MHFP.h:25
ExplicitBitVect
a class for bit vectors that are densely occupied
Definition: ExplicitBitVect.h:29
RDKit::MHFPFingerprints::MHFPEncoder::EncodeSECFP
std::vector< ExplicitBitVect > EncodeSECFP(std::vector< ROMol > &mols, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=true, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
export.h