PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/Python/Templates/Samples/ProjectTemplates/Python/Machine Learning/RegressionTemplate/regression.py

https://gitlab.com/SplatoonModdingHub/PTVS
Python | 303 lines | 81 code | 65 blank | 157 comment | 4 complexity | 05f20cbcea51db042ffdbe40f3077967 MD5 | raw file
  1. '''
  2. This script performs the basic process for applying a machine learning
  3. algorithm to a dataset using Python libraries.
  4. The four steps are:
  5. 1. Download a dataset (using pandas)
  6. 2. Process the numeric data (using numpy)
  7. 3. Train and evaluate learners (using scikit-learn)
  8. 4. Plot and compare results (using matplotlib)
  9. The data is downloaded from URL, which is defined below. As is normal
  10. for machine learning problems, the nature of the source data affects
  11. the entire solution. When you change URL to refer to your own data, you
  12. will need to review the data processing steps to ensure they remain
  13. correct.
  14. ============
  15. Example Data
  16. ============
  17. The example is from http://mldata.org/repository/data/viewslug/stockvalues/
  18. It contains stock prices and the values of three indices for each day
  19. over a five year period. See the linked page for more details about
  20. this data set.
  21. This script uses regression learners to predict the stock price for
  22. the second half of this period based on the values of the indices. This
  23. is a naive approach, and a more robust method would use each prediction
  24. as an input for the next, and would predict relative rather than
  25. absolute values.
  26. '''
  27. # Remember to update the script for the new data when you change this URL
  28. URL = "http://mldata.org/repository/data/download/csv/stockvalues/"
  29. # This is the column of the sample data to predict.
  30. # Try changing it to other integers between 1 and 155.
  31. TARGET_COLUMN = 32
  32. # Uncomment this call when using matplotlib to generate images
  33. # rather than displaying interactive UI.
  34. #import matplotlib
  35. #matplotlib.use('Agg')
  36. from pandas import read_table
  37. import numpy as np
  38. import matplotlib.pyplot as plt
  39. try:
  40. # [OPTIONAL] Seaborn makes plots nicer
  41. import seaborn
  42. except ImportError:
  43. pass
  44. # =====================================================================
  45. def download_data():
  46. '''
  47. Downloads the data for this script into a pandas DataFrame.
  48. '''
  49. # If your data is in an Excel file, install 'xlrd' and use
  50. # pandas.read_excel instead of read_table
  51. #from pandas import read_excel
  52. #frame = read_excel(URL)
  53. # If your data is in a private Azure blob, install 'azure' and use
  54. # BlobService.get_blob_to_path() with read_table() or read_excel()
  55. #import azure.storage
  56. #service = azure.storage.BlobService(ACCOUNT_NAME, ACCOUNT_KEY)
  57. #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
  58. #frame = read_table('my_data.csv', ...
  59. frame = read_table(
  60. URL,
  61. # Uncomment if the file needs to be decompressed
  62. #compression='gzip',
  63. #compression='bz2',
  64. # Specify the file encoding
  65. # Latin-1 is common for data from US sources
  66. encoding='latin-1',
  67. #encoding='utf-8', # UTF-8 is also common
  68. # Specify the separator in the data
  69. sep=',', # comma separated values
  70. #sep='\t', # tab separated values
  71. #sep=' ', # space separated values
  72. # Ignore spaces after the separator
  73. skipinitialspace=True,
  74. # Generate row labels from each row number
  75. index_col=None,
  76. #index_col=0, # use the first column as row labels
  77. #index_col=-1, # use the last column as row labels
  78. # Generate column headers row from each column number
  79. header=None,
  80. #header=0, # use the first line as headers
  81. # Use manual headers and skip the first row in the file
  82. #header=0,
  83. #names=['col1', 'col2', ...],
  84. )
  85. # Return the entire frame
  86. #return frame
  87. # Return a subset of the columns
  88. return frame[[156, 157, 158, TARGET_COLUMN]]
  89. # =====================================================================
  90. def get_features_and_labels(frame):
  91. '''
  92. Transforms and scales the input data and returns numpy arrays for
  93. training and testing inputs and targets.
  94. '''
  95. # Replace missing values with 0.0
  96. # or we can use scikit-learn to calculate missing values below
  97. #frame[frame.isnull()] = 0.0
  98. # Convert values to floats
  99. arr = np.array(frame, dtype=np.float)
  100. # Normalize the entire data set
  101. from sklearn.preprocessing import StandardScaler, MinMaxScaler
  102. arr = MinMaxScaler().fit_transform(arr)
  103. # Use the last column as the target value
  104. X, y = arr[:, :-1], arr[:, -1]
  105. # To use the first column instead, change the index value
  106. #X, y = arr[:, 1:], arr[:, 0]
  107. # Use 50% of the data for training, but we will test against the
  108. # entire set
  109. from sklearn.cross_validation import train_test_split
  110. X_train, _, y_train, _ = train_test_split(X, y, test_size=0.5)
  111. X_test, y_test = X, y
  112. # If values are missing we could impute them from the training data
  113. #from sklearn.preprocessing import Imputer
  114. #imputer = Imputer(strategy='mean')
  115. #imputer.fit(X_train)
  116. #X_train = imputer.transform(X_train)
  117. #X_test = imputer.transform(X_test)
  118. # Normalize the attribute values to mean=0 and variance=1
  119. from sklearn.preprocessing import StandardScaler
  120. scaler = StandardScaler()
  121. # To scale to a specified range, use MinMaxScaler
  122. #from sklearn.preprocessing import MinMaxScaler
  123. #scaler = MinMaxScaler(feature_range=(0, 1))
  124. # Fit the scaler based on the training data, then apply the same
  125. # scaling to both training and test sets.
  126. scaler.fit(X_train)
  127. X_train = scaler.transform(X_train)
  128. X_test = scaler.transform(X_test)
  129. # Return the training and test sets
  130. return X_train, X_test, y_train, y_test
  131. # =====================================================================
  132. def evaluate_learner(X_train, X_test, y_train, y_test):
  133. '''
  134. Run multiple times with different algorithms to get an idea of the
  135. relative performance of each configuration.
  136. Returns a sequence of tuples containing:
  137. (title, expected values, actual values)
  138. for each learner.
  139. '''
  140. # Use a support vector machine for regression
  141. from sklearn.svm import SVR
  142. # Train using a radial basis function
  143. svr = SVR(kernel='rbf', gamma=0.1)
  144. svr.fit(X_train, y_train)
  145. y_pred = svr.predict(X_test)
  146. r_2 = svr.score(X_test, y_test)
  147. yield 'RBF Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred
  148. # Train using a linear kernel
  149. svr = SVR(kernel='linear')
  150. svr.fit(X_train, y_train)
  151. y_pred = svr.predict(X_test)
  152. r_2 = svr.score(X_test, y_test)
  153. yield 'Linear Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred
  154. # Train using a polynomial kernel
  155. svr = SVR(kernel='poly', degree=2)
  156. svr.fit(X_train, y_train)
  157. y_pred = svr.predict(X_test)
  158. r_2 = svr.score(X_test, y_test)
  159. yield 'Polynomial Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred
  160. # =====================================================================
  161. def plot(results):
  162. '''
  163. Create a plot comparing multiple learners.
  164. `results` is a list of tuples containing:
  165. (title, expected values, actual values)
  166. All the elements in results will be plotted.
  167. '''
  168. # Using subplots to display the results on the same X axis
  169. fig, plts = plt.subplots(nrows=len(results), figsize=(8, 8))
  170. fig.canvas.set_window_title('Predicting data from ' + URL)
  171. # Show each element in the plots returned from plt.subplots()
  172. for subplot, (title, y, y_pred) in zip(plts, results):
  173. # Configure each subplot to have no tick marks
  174. # (these are meaningless for the sample dataset)
  175. subplot.set_xticklabels(())
  176. subplot.set_yticklabels(())
  177. # Label the vertical axis
  178. subplot.set_ylabel('stock price')
  179. # Set the title for the subplot
  180. subplot.set_title(title)
  181. # Plot the actual data and the prediction
  182. subplot.plot(y, 'b', label='actual')
  183. subplot.plot(y_pred, 'r', label='predicted')
  184. # Shade the area between the predicted and the actual values
  185. subplot.fill_between(
  186. # Generate X values [0, 1, 2, ..., len(y)-2, len(y)-1]
  187. np.arange(0, len(y), 1),
  188. y,
  189. y_pred,
  190. color='r',
  191. alpha=0.2
  192. )
  193. # Mark the extent of the training data
  194. subplot.axvline(len(y) // 2, linestyle='--', color='0', alpha=0.2)
  195. # Include a legend in each subplot
  196. subplot.legend()
  197. # Let matplotlib handle the subplot layout
  198. fig.tight_layout()
  199. # ==================================
  200. # Display the plot in interactive UI
  201. plt.show()
  202. # To save the plot to an image file, use savefig()
  203. #plt.savefig('plot.png')
  204. # Open the image file with the default image viewer
  205. #import subprocess
  206. #subprocess.Popen('plot.png', shell=True)
  207. # To save the plot to an image in memory, use BytesIO and savefig()
  208. # This can then be written to any stream-like object, such as a
  209. # file or HTTP response.
  210. #from io import BytesIO
  211. #img_stream = BytesIO()
  212. #plt.savefig(img_stream, fmt='png')
  213. #img_bytes = img_stream.getvalue()
  214. #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))
  215. # Closing the figure allows matplotlib to release the memory used.
  216. plt.close()
  217. # =====================================================================
  218. if __name__ == '__main__':
  219. # Download the data set from URL
  220. print("Downloading data from {}".format(URL))
  221. frame = download_data()
  222. # Process data into feature and label arrays
  223. print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
  224. X_train, X_test, y_train, y_test = get_features_and_labels(frame)
  225. # Evaluate multiple regression learners on the data
  226. print("Evaluating regression learners")
  227. results = list(evaluate_learner(X_train, X_test, y_train, y_test))
  228. # Display the results
  229. print("Plotting the results")
  230. plot(results)