PageRenderTime 36ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/Python/Templates/Samples/ProjectTemplates/Python/Machine Learning/ClassifierTemplate/classifier.py

https://gitlab.com/SplatoonModdingHub/PTVS
Python | 290 lines | 267 code | 0 blank | 23 comment | 0 complexity | 600e69c56cce78a4037d6669c5dd0e15 MD5 | raw file
  1. '''
  2. This script performs the basic process for applying a machine learning
  3. algorithm to a dataset using Python libraries.
  4. The four steps are:
  5. 1. Download a dataset (using pandas)
  6. 2. Process the numeric data (using numpy)
  7. 3. Train and evaluate learners (using scikit-learn)
  8. 4. Plot and compare results (using matplotlib)
  9. The data is downloaded from URL, which is defined below. As is normal
  10. for machine learning problems, the nature of the source data affects
  11. the entire solution. When you change URL to refer to your own data, you
  12. will need to review the data processing steps to ensure they remain
  13. correct.
  14. ============
  15. Example Data
  16. ============
  17. The example is from http://archive.ics.uci.edu/ml/datasets/Spambase
  18. It contains pre-processed metrics, such as the frequency of certain
  19. words and letters, from a collection of emails. A classification for
  20. each one indicating 'spam' or 'not spam' is in the final column.
  21. See the linked page for full details of the data set.
  22. This script uses three classifiers to predict the class of an email
  23. based on the metrics. These are not representative of modern spam
  24. detection systems.
  25. '''
  26. # Remember to update the script for the new data when you change this URL
  27. URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
  28. # Uncomment this call when using matplotlib to generate images
  29. # rather than displaying interactive UI.
  30. #import matplotlib
  31. #matplotlib.use('Agg')
  32. from pandas import read_table
  33. import numpy as np
  34. import matplotlib.pyplot as plt
  35. try:
  36. # [OPTIONAL] Seaborn makes plots nicer
  37. import seaborn
  38. except ImportError:
  39. pass
  40. # =====================================================================
  41. def download_data():
  42. '''
  43. Downloads the data for this script into a pandas DataFrame.
  44. '''
  45. # If your data is in an Excel file, install 'xlrd' and use
  46. # pandas.read_excel instead of read_table
  47. #from pandas import read_excel
  48. #frame = read_excel(URL)
  49. # If your data is in a private Azure blob, install 'azure' and use
  50. # BlobService.get_blob_to_path() with read_table() or read_excel()
  51. #import azure.storage
  52. #service = azure.storage.BlobService(ACCOUNT_NAME, ACCOUNT_KEY)
  53. #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
  54. #frame = read_table('my_data.csv', ...
  55. frame = read_table(
  56. URL,
  57. # Uncomment if the file needs to be decompressed
  58. #compression='gzip',
  59. #compression='bz2',
  60. # Specify the file encoding
  61. # Latin-1 is common for data from US sources
  62. encoding='latin-1',
  63. #encoding='utf-8', # UTF-8 is also common
  64. # Specify the separator in the data
  65. sep=',', # comma separated values
  66. #sep='\t', # tab separated values
  67. #sep=' ', # space separated values
  68. # Ignore spaces after the separator
  69. skipinitialspace=True,
  70. # Generate row labels from each row number
  71. index_col=None,
  72. #index_col=0, # use the first column as row labels
  73. #index_col=-1, # use the last column as row labels
  74. # Generate column headers row from each column number
  75. header=None,
  76. #header=0, # use the first line as headers
  77. # Use manual headers and skip the first row in the file
  78. #header=0,
  79. #names=['col1', 'col2', ...],
  80. )
  81. # Return a subset of the columns
  82. #return frame[['col1', 'col4', ...]]
  83. # Return the entire frame
  84. return frame
  85. # =====================================================================
  86. def get_features_and_labels(frame):
  87. '''
  88. Transforms and scales the input data and returns numpy arrays for
  89. training and testing inputs and targets.
  90. '''
  91. # Replace missing values with 0.0, or we can use
  92. # scikit-learn to calculate missing values (below)
  93. #frame[frame.isnull()] = 0.0
  94. # Convert values to floats
  95. arr = np.array(frame, dtype=np.float)
  96. # Use the last column as the target value
  97. X, y = arr[:, :-1], arr[:, -1]
  98. # To use the first column instead, change the index value
  99. #X, y = arr[:, 1:], arr[:, 0]
  100. # Use 80% of the data for training; test against the rest
  101. from sklearn.cross_validation import train_test_split
  102. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
  103. # sklearn.pipeline.make_pipeline could also be used to chain
  104. # processing and classification into a black box, but here we do
  105. # them separately.
  106. # If values are missing we could impute them from the training data
  107. #from sklearn.preprocessing import Imputer
  108. #imputer = Imputer(strategy='mean')
  109. #imputer.fit(X_train)
  110. #X_train = imputer.transform(X_train)
  111. #X_test = imputer.transform(X_test)
  112. # Normalize the attribute values to mean=0 and variance=1
  113. from sklearn.preprocessing import StandardScaler
  114. scaler = StandardScaler()
  115. # To scale to a specified range, use MinMaxScaler
  116. #from sklearn.preprocessing import MinMaxScaler
  117. #scaler = MinMaxScaler(feature_range=(0, 1))
  118. # Fit the scaler based on the training data, then apply the same
  119. # scaling to both training and test sets.
  120. scaler.fit(X_train)
  121. X_train = scaler.transform(X_train)
  122. X_test = scaler.transform(X_test)
  123. # Return the training and test sets
  124. return X_train, X_test, y_train, y_test
  125. # =====================================================================
  126. def evaluate_classifier(X_train, X_test, y_train, y_test):
  127. '''
  128. Run multiple times with different classifiers to get an idea of the
  129. relative performance of each configuration.
  130. Returns a sequence of tuples containing:
  131. (title, precision, recall)
  132. for each learner.
  133. '''
  134. # Import some classifiers to test
  135. from sklearn.svm import LinearSVC, NuSVC
  136. from sklearn.ensemble import AdaBoostClassifier
  137. # We will calculate the P-R curve for each classifier
  138. from sklearn.metrics import precision_recall_curve, f1_score
  139. # Here we create classifiers with default parameters. These need
  140. # to be adjusted to obtain optimal performance on your data set.
  141. # Test the linear support vector classifier
  142. classifier = LinearSVC(C=1)
  143. # Fit the classifier
  144. classifier.fit(X_train, y_train)
  145. score = f1_score(y_test, classifier.predict(X_test))
  146. # Generate the P-R curve
  147. y_prob = classifier.decision_function(X_test)
  148. precision, recall, _ = precision_recall_curve(y_test, y_prob)
  149. # Include the score in the title
  150. yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall
  151. # Test the Nu support vector classifier
  152. classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
  153. # Fit the classifier
  154. classifier.fit(X_train, y_train)
  155. score = f1_score(y_test, classifier.predict(X_test))
  156. # Generate the P-R curve
  157. y_prob = classifier.decision_function(X_test)
  158. precision, recall, _ = precision_recall_curve(y_test, y_prob)
  159. # Include the score in the title
  160. yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall
  161. # Test the Ada boost classifier
  162. classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
  163. # Fit the classifier
  164. classifier.fit(X_train, y_train)
  165. score = f1_score(y_test, classifier.predict(X_test))
  166. # Generate the P-R curve
  167. y_prob = classifier.decision_function(X_test)
  168. precision, recall, _ = precision_recall_curve(y_test, y_prob)
  169. # Include the score in the title
  170. yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall
  171. # =====================================================================
  172. def plot(results):
  173. '''
  174. Create a plot comparing multiple learners.
  175. `results` is a list of tuples containing:
  176. (title, precision, recall)
  177. All the elements in results will be plotted.
  178. '''
  179. # Plot the precision-recall curves
  180. fig = plt.figure(figsize=(6, 6))
  181. fig.canvas.set_window_title('Classifying data from ' + URL)
  182. for label, precision, recall in results:
  183. plt.plot(recall, precision, label=label)
  184. plt.title('Precision-Recall Curves')
  185. plt.xlabel('Precision')
  186. plt.ylabel('Recall')
  187. plt.legend(loc='lower left')
  188. # Let matplotlib improve the layout
  189. plt.tight_layout()
  190. # ==================================
  191. # Display the plot in interactive UI
  192. plt.show()
  193. # To save the plot to an image file, use savefig()
  194. #plt.savefig('plot.png')
  195. # Open the image file with the default image viewer
  196. #import subprocess
  197. #subprocess.Popen('plot.png', shell=True)
  198. # To save the plot to an image in memory, use BytesIO and savefig()
  199. # This can then be written to any stream-like object, such as a
  200. # file or HTTP response.
  201. #from io import BytesIO
  202. #img_stream = BytesIO()
  203. #plt.savefig(img_stream, fmt='png')
  204. #img_bytes = img_stream.getvalue()
  205. #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))
  206. # Closing the figure allows matplotlib to release the memory used.
  207. plt.close()
  208. # =====================================================================
  209. if __name__ == '__main__':
  210. # Download the data set from URL
  211. print("Downloading data from {}".format(URL))
  212. frame = download_data()
  213. # Process data into feature and label arrays
  214. print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
  215. X_train, X_test, y_train, y_test = get_features_and_labels(frame)
  216. # Evaluate multiple classifiers on the data
  217. print("Evaluating classifiers")
  218. results = list(evaluate_classifier(X_train, X_test, y_train, y_test))
  219. # Display the results
  220. print("Plotting the results")
  221. plot(results)