'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
   1. Download a dataset (using pandas)
   2. Process the numeric data (using numpy)
   3. Train and evaluate learners (using scikit-learn)
   4. Plot and compare results (using matplotlib)

The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://archive.ics.uci.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one, indicating 'spam' or 'not spam', is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''

# Remember to update the script for the new data when you change this URL
URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
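
# Note: read_table accepts a local file path as well as a URL, so URL does not
# have to be a web address when you point the script at your own data. The
# paths below are purely illustrative placeholders.
#URL = r"C:\data\my_data.csv"
#URL = "./my_data.csv"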

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt

try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
except ImportError:
    pass
# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' and use
    # pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure' and use
    # BlobService.get_blob_to_path() with read_table() or read_excel()
    #import azure.storage
    #service = azure.storage.BlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',        # UTF-8 is also common

        # Specify the separator in the data
        sep=',',                  # comma separated values
        #sep='\t',                # tab separated values
        #sep=' ',                 # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,             # use the first column as row labels
        #index_col=-1,            # use the last column as row labels

        # Generate column headers from each column number
        header=None,
        #header=0,                # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame
# =====================================================================

def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately; a commented sketch follows.
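
    # For reference only -- a minimal sketch of the pipeline alternative,
    # assuming the same StandardScaler and a LinearSVC; it is not run by
    # this script. make_pipeline fits each step in order on the training
    # data and re-applies them automatically when predicting or scoring.
    #from sklearn.pipeline import make_pipeline
    #from sklearn.preprocessing import StandardScaler
    #from sklearn.svm import LinearSVC
    #model = make_pipeline(StandardScaler(), LinearSVC(C=1))
    #model.fit(X_train, y_train)            # scales, then trains the classifier
    #accuracy = model.score(X_test, y_test) # scales X_test with the fitted scaler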

    # If values are missing we could impute them from the training data
    #from sklearn.impute import SimpleImputer
    #imputer = SimpleImputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test
# =====================================================================

def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve and F1 score for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.
    # A commented parameter-search sketch follows this comment.
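
    # For reference only -- one possible way to tune parameters, assuming
    # scikit-learn's GridSearchCV and an illustrative grid of C values;
    # it is not run by this script.
    #from sklearn.model_selection import GridSearchCV
    #search = GridSearchCV(LinearSVC(), {'C': [0.01, 0.1, 1, 10]},
    #                      scoring='f1', cv=5)
    #search.fit(X_train, y_train)
    #print(search.best_params_)           # best combination found on X_train
    #classifier = search.best_estimator_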

    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall
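
    # For reference only -- a sketch of how another learner could be added to
    # the comparison, assuming a RandomForestClassifier; classifiers without
    # decision_function() can supply predict_proba() scores to the P-R curve.
    #from sklearn.ensemble import RandomForestClassifier
    #classifier = RandomForestClassifier(n_estimators=100)
    #classifier.fit(X_train, y_train)
    #score = f1_score(y_test, classifier.predict(X_test))
    #y_prob = classifier.predict_proba(X_test)[:, 1]   # probability of class 1
    #precision, recall, _ = precision_recall_curve(y_test, y_prob)
    #yield 'Random Forest (F1 score={:.3f})'.format(score), precision, recall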
# =====================================================================

def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves
    fig = plt.figure(figsize=(6, 6))
    fig.canvas.manager.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, format='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()
# =====================================================================

if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)