PageRenderTime 284ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/TextAnalyzer.py

https://gitlab.com/vicidroiddev/SouthParkMachineLearning
Python | 302 lines | 192 code | 65 blank | 45 comment | 22 complexity | 576f8c2f13ab0f432bf62f574bf622f5 MD5 | raw file
  1. from sklearn.externals import joblib
  2. from Utils import Utils
  3. __author__ = 'Raphael'
  4. import logging
  5. import pandas
  6. import numpy as np
  7. import time
  8. import os
  9. import multiprocessing
  10. import matplotlib.pyplot as plt
  11. from Benchmark import Benchmark
  12. from DataSanitzer import DataSanitizer
  13. from Dataset import Dataset
  14. from sklearn.pipeline import Pipeline
  15. import sknn.mlp
  16. from sknn.backend import lasagne
  17. from sklearn.feature_extraction import DictVectorizer
  18. from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
  19. from sklearn.ensemble import RandomForestClassifier
  20. from sklearn.cross_validation import train_test_split
  21. from sklearn.naive_bayes import MultinomialNB
  22. from sklearn.linear_model import SGDClassifier
  23. from sklearn.grid_search import GridSearchCV
  24. from gensim.models import Word2Vec
  25. logging.basicConfig()
  26. logger = logging.getLogger('TextAnalzer')
  27. logger.setLevel(logging.INFO)
  28. # REVIEW_DATA = './data/Season-1.csv'
  29. REVIEW_DATA = './data/All-seasons.csv'
  30. LINE = 'Line'
  31. SEASON = 'Season'
  32. EPISODE = 'Episode'
  33. CHARACTER = 'Character'
  34. CHARACTER_PREDICTION = 'Character Prediction'
  35. DEBUG = True
  36. N_FOLDS = 10
  37. NR_FOREST_ESTIMATORS = 100
  38. MAX_FEATURES = 50000
  39. TEST_RATIO = 0.25
  40. X_TRAINING = 'training_features'
  41. Y_TRAINING = 'training_labels'
  42. X_TESTING = 'testing_features'
  43. Y_TESTING = 'testing_labels'
  44. X_TRAINING_BAG_OF_WORDS = 'features_bagofwords'
  45. X_TESTING_BAG_OF_WORDS = 'testing_features_bagofwords'
  46. GENSIM_MODEL = 'gen_sim_model'
  47. NN_LEARNING_RATE = 0.002
  48. SAVE_MODEL = False
  49. NAIVE_BAYES = 'Naive_Bayes'
  50. SVM_SGD = 'SVM_SGD'
  51. RANDOM_FOREST = 'Random_Forests'
  52. NEURAL_NETWORK = 'Neural_Network_MLP'
  53. class TextAnalyzer:
  54. class Data:
  55. def __init__(self):
  56. pass
  57. def __init__(self, csvPath):
  58. self.csvPath = csvPath
  59. self.dataFrame = None
  60. self.slicedDF = None
  61. self.dataset = Dataset()
  62. def createDataFrame(self, csvPath=None, nameFilter=None):
  63. self.dataFrame = pandas.read_csv(
  64. self.csvPath if csvPath is None else csvPath,
  65. sep=',',
  66. header=0,
  67. skipinitialspace=True,
  68. quotechar='"'
  69. )
  70. # names = pandas.unique(self.dataFrame[CHARACTER].values)
  71. # Utils.printListItems(names)
  72. # if True:
  73. # self.dataFrame = self.dataFrame[(self.dataFrame.Character.isin(nameFilter))]
  74. if nameFilter:
  75. self.dataFrame[CHARACTER] = [self.applyNameFilter(x, nameFilter) for x in self.dataFrame.Character]
  76. self.dataset.X = self.dataFrame[[LINE]].values.ravel()
  77. self.dataset.Y = self.dataFrame[[CHARACTER]].values.ravel()
  78. def applyNameFilter(self, name, filter):
  79. return name if filter is None or name in filter else 'Other'
  80. # Filters out uneeded stuff and produces an array of words
  81. def cleanData(self):
  82. self.dataset.X_cleaned[:] = [DataSanitizer.filterWords(x) for x in self.dataset.X]
  83. def vectorizeData(self, scheme=None, resume=None):
  84. if scheme == 'word2vec':
  85. model = None
  86. if resume:
  87. model = Word2Vec.load(GENSIM_MODEL)
  88. if model is None:
  89. model = Word2Vec(
  90. self.dataset.X_cleaned,
  91. min_count=20,
  92. size=100,
  93. workers=multiprocessing.cpu_count()
  94. )
  95. model.save(GENSIM_MODEL)
  96. elif scheme == 'bagofwords':
  97. vec = CountVectorizer(
  98. analyzer="word",
  99. tokenizer=None,
  100. preprocessor=None,
  101. stop_words=None,
  102. max_features=MAX_FEATURES
  103. )
  104. self.dataset.X = vec.fit_transform(self.dataset.X_cleaned).toarray()
  105. # Normalize the frequencies with Tf-idf, this seems to shave off half the training time!
  106. def genTfIdf(self):
  107. transformer = TfidfTransformer(use_idf=True)
  108. transformer.fit_transform(self.dataset.X)
  109. self.dataset.X = transformer.transform(self.dataset.X)
  110. def splitData(self):
  111. X_train, X_test, Y_train, Y_test = train_test_split(
  112. self.dataset.X, # bag of words
  113. self.dataset.Y,
  114. test_size=TEST_RATIO,
  115. random_state=42
  116. )
  117. self.dataset.X_train = X_train
  118. self.dataset.Y_train = Y_train
  119. self.dataset.X_test = X_test
  120. self.dataset.Y_test = Y_test
  121. def doSVMwithGridSearch(self):
  122. # text_clf = Pipeline([(
  123. # 'clf',
  124. # SGDClassifier(shuffle=False, n_jobs=-1, n_iter=10, random_state=42)), ])
  125. parameters = {
  126. # 'seed': [0],
  127. 'loss': ('log', 'hinge'),
  128. 'penalty': ['l1', 'l2', 'elasticnet'],
  129. 'alpha': [0.001, 0.0001, 0.00001, 0.000001]
  130. }
  131. classifier = GridSearchCV(SGDClassifier(), parameters, n_jobs=-1)
  132. return classifier
  133. # self.svmClf.fit(self.dataset.X_train, self.dataset.Y_train)
  134. # predicted = self.svmClf.predict(self.dataset.X_test)
  135. # self.saveResults(predicted, 'SVM new')
  136. def classifyData(self, algo=None, saveModel=False):
  137. bench = Benchmark()
  138. classifier = None
  139. prediction = None
  140. if algo == SVM_SGD:
  141. classifier = SGDClassifier(n_jobs=-1, loss='hinge', penalty='l2', alpha=1e-5, n_iter=50, random_state=42)
  142. # classifier = self.doSVMwithGridSearch()
  143. elif algo == NEURAL_NETWORK:
  144. classifier = sknn.mlp.Classifier(
  145. layers=[ # Sigmoid, Tanh, Rectifier, Softmax, Linear
  146. # sknn.mlp.Layer("Tanh", units=300),
  147. (sknn.mlp.Layer("Linear", units=300) for i in range(2)),
  148. sknn.mlp.Layer("Softmax"),
  149. ],
  150. learning_rate=NN_LEARNING_RATE,
  151. n_iter=10,
  152. learning_momentum=.9,
  153. debug=False,
  154. regularize=None, # L1, L2, dropout, and batch normalization.
  155. learning_rule='sgd' # sgd, momentum, nesterov, adadelta, adagrad, rmsprop, adam
  156. )
  157. elif algo == RANDOM_FOREST:
  158. classifier = RandomForestClassifier(n_estimators=NR_FOREST_ESTIMATORS, n_jobs=-1)
  159. elif algo == NAIVE_BAYES:
  160. classifier = MultinomialNB()
  161. classifier.fit(self.dataset.X_train, self.dataset.Y_train)
  162. bench.end('Training Data using: ' + algo)
  163. # save that training model
  164. if saveModel:
  165. joblib.dump(classifier, './model/classifier_{}_{}'.format(algo, time.time()), compress=9)
  166. bench.end('Dumping Classifier Data')
  167. prediction = classifier.predict(self.dataset.X_test)
  168. score = classifier.score(self.dataset.X_test, self.dataset.Y_test)
  169. bench.end('Predicting Data using: ' + algo)
  170. if algo == NEURAL_NETWORK:
  171. prediction = [x[0] for x in prediction]
  172. self.saveResults(prediction, algo, score=score)
  173. # Convenience method for printing out a panda dataframe
  174. def printDataFrame(self, dataframe):
  175. with pandas.option_context('display.max_rows', 100, 'display.max_columns', 100):
  176. logger.info(dataframe)
  177. def vectorizeDict(self):
  178. self.dictVec = DictVectorizer(sparse=False)
  179. self.dictVec.fit_transform(self.dataset.X_train)
  180. # self.countVec = CountVectorizer()
  181. # self.countVec.fit_transform(self.xTrainingData)
  182. # print 'Done Vectorizing'
  183. # def kFoldIndices(self):
  184. # return KFold(n=self.dataFrame.shape[0], n_folds = N_FOLDS)
  185. # def saveStats(self):
  186. # frame = pandas.DataFrame(
  187. # data={
  188. # 'Max Features': [MAX_FEATURES, ],
  189. # 'Forest Estimators': [NR_FOREST_ESTIMATORS, ],
  190. # 'Accuracy': [self.accuracy]
  191. # }
  192. # )
  193. #
  194. # frame = frame.transpose()
  195. #
  196. # self.printDataFrame(frame)
  197. #
  198. # frame.to_csv("./results/Trail Stats {}".format(time.time()), index=True)
  199. #
  200. # # self.printDataFrame(frame)
  201. def saveResults(self, prediction, classifierName, **kwargs):
  202. output = pandas.DataFrame(
  203. data={
  204. # LINE: self.dataset.X_test_original,
  205. CHARACTER: self.dataset.Y_test,
  206. CHARACTER_PREDICTION: prediction
  207. }
  208. )
  209. output.to_csv("./results/{}.csv".format(classifierName), index=False)
  210. logger.info('Accuracy: {} %'.format(round(kwargs['score'] * 100, 3)))
  211. def optimizeParams(self):
  212. self.params = {
  213. 'vect__ngram_range': [(1, 1), (1, 2)],
  214. 'tfidf__use_idf': (True, False),
  215. 'clf__alpha': (1e-1, 1e-2, 1e-3),
  216. }
  217. if __name__ == '__main__':
  218. bench = Benchmark()
  219. anal = TextAnalyzer(REVIEW_DATA)
  220. bench.end('Initializing')
  221. anal.createDataFrame(nameFilter=['Kyle', 'Stan', 'Kenny', 'Cartman', 'Butters', 'Jimmy',
  222. 'Timmy'])
  223. bench.end('Reading CSV')
  224. anal.cleanData() # Prepare data in a format that is good for scikitlearn
  225. bench.end('Cleaning Data')
  226. anal.vectorizeData(scheme='bagofwords')
  227. bench.end('Generating Bag of Words Representation')
  228. anal.genTfIdf() #
  229. bench.end('Generating TF-IDF Representation')
  230. anal.splitData()
  231. bench.end('Generating Test and Training Data')
  232. anal.classifyData(NAIVE_BAYES, saveModel=SAVE_MODEL)
  233. # plt.scatter(anal.dataset.X_train, anal.dataset.Y_train, color='black')
  234. # plt.show
  235. # anal.optimizeParams()
  236. # anal.doSVM()
  237. # anal.saveStats()