Package rdkit :: Package ML :: Module files
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.files

  1  # copyright 2000 greg landrum 
  2  """ Generic file manipulation stuff 
  3   
  4  """ 
  5  from __future__ import print_function 
  6   
  7  import re 
  8   
  9  import numpy 
 10   
 11   
class ReFile(object):
    """Convenience class for dealing with files with comments.

    Blank (all whitespace) lines and lines beginning with comment
    characters are skipped.

    Anything following a comment character on a line is stripped off.

    The file is opened by the constructor.  Call close() when finished, or
    use the instance as a context manager so the handle is always released:

        with ReFile(name) as inF:
            lines = inF.readlines()
    """

    def __init__(self, fileName, mode='r', comment=r'#', trailer=r'\n'):
        """Open the file and build the pattern used to cut lines.

        **ARGUMENTS:**

          - fileName: name of the file to open

          - mode: mode string passed straight to open()

          - comment: regular expression marking the start of a comment

          - trailer: regular expression for the line terminator; unless it
            is None or empty it is OR-ed onto the comment expression
        """
        if trailer is not None and trailer != '':
            comment = comment + r'|' + trailer
        self.regExp = re.compile(comment)
        self.inFile = open(fileName, mode)

    def __enter__(self):
        """Context-manager entry: return the reader itself."""
        return self

    def __exit__(self, excType, excValue, excTb):
        """Context-manager exit: close the file, never swallow exceptions."""
        self.close()

    def close(self):
        """Close the underlying file object."""
        self.inFile.close()

    def _stripComment(self, line):
        """Return the text before the first comment/trailer match, stripped."""
        return self.regExp.split(line)[0].strip()

    def readline(self):
        """Read the next non-blank, non-comment line and return it.

        Return '' on EOF.
        """
        while True:
            inLine = self.inFile.readline()
            # An empty read is the only EOF signal; a blank line in the file
            # still carries its terminator and is therefore non-empty here.
            if not inLine:
                return ''
            result = self._stripComment(inLine)
            if result != '':
                return result

    def readlines(self):
        """Return a list of all the (cleaned) lines left in the file.

        Return [] if there are none.
        """
        cleaned = (self._stripComment(line) for line in self.inFile.readlines())
        return [text for text in cleaned if text != '']

    def rewind(self):
        """Rewind the file (seek to the beginning)."""
        self.inFile.seek(0)
61 62
def ReadDataFile(fileName, comment=r'#', depVarCol=0, dataType=float):
    """Read in the data file and return a tuple of two numpy arrays:
    (independent variables, dependent variables).

    **ARGUMENTS:**

      - fileName: the fileName

      - comment: the comment character for the file

      - depVarCol: the column number containing the dependent variable

      - dataType: the numpy data type of the returned arrays.  The default
        is the builtin float, which is what the removed numpy.float alias
        referred to.

    **RETURNS:**

      a tuple of two numpy arrays:

        (independent variables, dependent variables).

      The first is (number of data lines) x (columns - 1), the second has
      one entry per data line.  A file with no data lines yields two empty
      arrays.
    """
    inFile = ReFile(fileName, comment=comment)
    try:
        dataLines = inFile.readlines()
    finally:
        # ReFile keeps the open handle on its inFile attribute; everything
        # needed is in memory now, so release it.
        inFile.inFile.close()
    nPts = len(dataLines)

    # Any floating dtype is parsed with float(), everything else with int().
    if numpy.issubdtype(numpy.dtype(dataType), numpy.floating):
        _convfunc = float
    else:
        _convfunc = int

    if nPts == 0:
        # No first line to measure the column count from.
        return numpy.zeros((0, 0), dataType), numpy.zeros(0, dataType)

    # Column count is taken from the first line; one column is the
    # dependent variable.
    nIndVars = len(dataLines[0].split()) - 1
    indVarMat = numpy.zeros((nPts, nIndVars), dataType)
    depVarVect = numpy.zeros(nPts, dataType)
    for i, line in enumerate(dataLines):
        splitLine = line.split()
        depVarVect[i] = _convfunc(splitLine[depVarCol])
        del splitLine[depVarCol]
        # Build a real list: a bare map() is a lazy iterator on Python 3 and
        # cannot be assigned into an array row.
        indVarMat[i, :] = [_convfunc(tok) for tok in splitLine]

    return indVarMat, depVarVect
if __name__ == '__main__':
    # Script entry point: parse the data file named by the first
    # command-line argument and print both resulting arrays.
    import sys

    indepVars, depVars = ReadDataFile(sys.argv[1])
    print('iV:', indepVars)
    print('dV:', depVars)