/email_input.py

https://github.com/JRMeyer/tensorflow-tutorial
'''Build bag-of-words feature matrices from ham/spam email files for the
TensorFlow tutorial (https://github.com/JRMeyer/tensorflow-tutorial).'''
import argparse
import glob
import os
import random
import re
from collections import Counter

import numpy as np


class DocReader():

    def __init__(self):
        pass

    def create_bag_of_words(self, filePaths):
        '''
        Input:
          filePaths: Array. A list of absolute filepaths
        Returns:
          bagOfWords: Array. All tokens in files
        '''
        bagOfWords = []
        # Strip any "X-Spam-*" header lines so the features cannot leak an
        # existing spam filter's verdict.
        regex = re.compile(r"X-Spam.*\n")
        for filePath in filePaths:
            with open(filePath, encoding="latin-1") as f:
                raw = f.read()
            raw = regex.sub('', raw)
            bagOfWords.extend(raw.split())
        return bagOfWords
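
    # A minimal usage sketch (the file path below is hypothetical):
    #
    #   reader = DocReader()
    #   bag = reader.create_bag_of_words(['/data/ham/0001.txt'])
    #   # 'bag' is a flat list of whitespace-delimited tokens, with
    #   # duplicates preserved, e.g. ['Subject:', 'hello', 'hello', ...]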

    def get_feature_matrix(self, filePaths, featureDict):
        '''
        Create the feature/x matrix from multiple text files.
        rows = files, cols = features
        '''
        featureMatrix = np.zeros(shape=(len(filePaths),
                                        len(featureDict)),
                                 dtype=float)
        regex = re.compile(r"X-Spam.*\n")
        for i, filePath in enumerate(filePaths):
            with open(filePath, encoding="latin-1") as f:
                _raw = f.read()
            raw = regex.sub('', _raw)
            tokens = raw.split()
            # unigram counts for this file
            fileUniDist = Counter(tokens)
            for key, value in fileUniDist.items():
                if key in featureDict:
                    featureMatrix[i, featureDict[key]] = value
        return featureMatrix
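
    # For example (hypothetical inputs): with
    # featureDict = {'cheap': 0, 'meeting': 1} and a file whose tokens are
    # ['cheap', 'cheap', 'meeting'], that file's row becomes [2., 1.];
    # tokens absent from featureDict are ignored.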

    def regularize_vectors(self, featureMatrix):
        '''
        Input:
          featureMatrix: matrix, where docs are rows and features are columns
        Returns:
          featureMatrix: matrix, updated by dividing each feature value by the
            total number of feature tokens in a given document (L1 norm)
        '''
        for doc in range(featureMatrix.shape[0]):
            totalWords = np.sum(featureMatrix[doc, :])
            # guard against division by zero for a document none of whose
            # tokens made it into the feature dictionary
            if totalWords > 0:
                featureMatrix[doc, :] /= totalWords
        return featureMatrix
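
    # Worked example: a row of raw counts [2., 1., 1.] sums to 4, so after
    # normalization it becomes [0.5, 0.25, 0.25]; every nonzero row sums to 1.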

    def input_data(self, hamDir, spamDir, percentTest, cutoff):
        '''
        Input:
          hamDir: String. dir of ham text files
          spamDir: String. dir of spam text files
          percentTest: Float. fraction of all data to be assigned to the test set
          cutoff: Int. words whose training-set frequency is <= cutoff are
            dropped from the feature dictionary
        Returns:
          trainX: Array. Feature matrix for training emails
          trainY: Array. Training labels: ham=(0,1), spam=(1,0)
          testX: Array. Feature matrix for testing emails
          testY: Array. Testing labels: ham=(0,1), spam=(1,0)
        '''
        pathLabelPairs = {}
        for hamPath in glob.glob(os.path.join(hamDir, '*')):
            pathLabelPairs[hamPath] = (0, 1)
        for spamPath in glob.glob(os.path.join(spamDir, '*')):
            pathLabelPairs[spamPath] = (1, 0)
        # get test set as random subsample of all data;
        # random.sample needs a sequence, so materialize the items first
        numTest = int(percentTest * len(pathLabelPairs))
        testing = random.sample(list(pathLabelPairs.items()), numTest)
        # delete testing data from superset of all data
        for path, label in testing:
            del pathLabelPairs[path]
        # split training tuples of (path,label) into separate lists
        trainPaths = []
        trainY = []
        for path, label in pathLabelPairs.items():
            trainPaths.append(path)
            trainY.append(label)
        # split testing tuples of (path,label) into separate lists
        testPaths = []
        testY = []
        for path, label in testing:
            testPaths.append(path)
            testY.append(label)
        # build the unigram vocabulary from the training set only
        bagOfWords = self.create_bag_of_words(trainPaths)
        # throw out low-frequency words
        freqDist = Counter(bagOfWords)
        features = set(word for word, freq in freqDist.items() if freq > cutoff)
        featureDict = {feature: i for i, feature in enumerate(features)}
        # make feature matrices
        trainX = self.get_feature_matrix(trainPaths, featureDict)
        testX = self.get_feature_matrix(testPaths, featureDict)
        # normalize document vectors to unit L1 length
        trainX = self.regularize_vectors(trainX)
        testX = self.regularize_vectors(testX)
        # cast labels as ndarrays
        trainY = np.asarray(trainY)
        testY = np.asarray(testY)
        return trainX, trainY, testX, testY
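
# Usage sketch (directory names are hypothetical):
#
#   reader = DocReader()
#   trainX, trainY, testX, testY = reader.input_data(
#       hamDir='data/ham/', spamDir='data/spam/',
#       percentTest=0.1, cutoff=15)
#   # trainX: (numTrain, numFeatures) float matrix, rows L1-normalized
#   # trainY: (numTrain, 2) one-hot labels: ham=(0,1), spam=(1,0)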

def parse_user_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-ham', '--hamDir', help='dir of ham text files')
    parser.add_argument('-spam', '--spamDir', help='dir of spam text files')
    args = parser.parse_args()
    return args
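
# Invoked from the command line, e.g. (paths are hypothetical):
#
#   python email_input.py -ham data/ham/ -spam data/spam/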

if __name__ == '__main__':
    # get user input
    args = parse_user_args()
    hamDir = args.hamDir
    spamDir = args.spamDir

    reader = DocReader()
    trainX, trainY, testX, testY = reader.input_data(hamDir=hamDir,
                                                     spamDir=spamDir,
                                                     percentTest=.1,
                                                     cutoff=15)
    print(trainX.shape)
    print(trainY.shape)
    print(testX.shape)
    print(testY.shape)

    # save the matrices as tab-delimited text for later stages
    np.savetxt("trainX.csv", trainX, delimiter="\t")
    np.savetxt("trainY.csv", trainY, delimiter="\t")
    np.savetxt("testX.csv", testX, delimiter="\t")
    np.savetxt("testY.csv", testY, delimiter="\t")

    # sanity-check the first few rows
    print(trainX[:10, :])
    print(trainY[:10, :])
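
# The saved files can be read back in a later step with, e.g. (a sketch;
# filenames match the savetxt calls above):
#
#   trainX = np.loadtxt("trainX.csv", delimiter="\t")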