# python_data_analysis/affective_crossvalidation.py
- """
- to read the file:
- import pandas as pd; tmp = pd.read_csv(<filename>, index_col=0)
- -use balanced error rate
- """
- from scipy.io import loadmat
- import pandas as pd
- from itertools import permutations
- import numpy as np
- from sklearn.cross_validation import StratifiedKFold
- import cPickle as pickle
# t-SNE perplexity — not referenced anywhere in this file; presumably used
# by a sibling script in the pipeline (TODO confirm).
perplexity = 30
# the two games whose recordings are cross-validated against each other
games = ["escape", "sahara"]
# sliding-window sizes; units not shown in this file — TODO confirm
windows = [2, 1]
# window_features[w]: the 100 pre-selected feature numbers for window size w.
# These are shifted by +1 in read_mat() because the raw .mat matrix carries
# two leading metadata columns (subject id, emotion label).
window_features = {}
window_features[1] = [278, 318, 130, 126, 10, 259, 336, 246, 223, 303, 158, 332, 233, 201, 291, 162, 234, 347, 117, 227, 274, 304, 219, 263, 118, 293, 235, 346, 120, 189, 97, 292, 276, 172, 294, 288, 177, 147, 321, 236, 249, 334, 173, 187, 289, 160, 96, 178, 323, 119, 260, 99, 220, 253, 184, 202, 175, 324, 281, 98, 248, 230, 252, 176, 245, 317, 205, 74, 285, 231, 77, 218, 250, 143, 188, 68, 306, 232, 216, 310, 217, 16, 194, 14, 210, 78, 311, 17, 1, 225, 290, 85, 2, 133, 46, 181, 307, 31, 308, 207]
window_features[2] = [303, 318, 158, 126, 10, 223, 278, 259, 336, 219, 77, 97, 289, 130, 201, 304, 249, 233, 162, 332, 189, 248, 274, 227, 347, 74, 96, 117, 246, 294, 118, 234, 187, 291, 99, 16, 78, 346, 120, 288, 252, 220, 292, 263, 147, 232, 293, 236, 172, 321, 177, 276, 253, 160, 334, 184, 235, 98, 310, 119, 224, 178, 173, 230, 250, 17, 323, 199, 216, 133, 260, 143, 202, 19, 231, 306, 75, 175, 176, 217, 188, 311, 205, 183, 324, 307, 218, 221, 251, 68, 84, 194, 210, 225, 20, 317, 182, 181, 132, 245]
def get_filename(game, window):
    """Return the path of the .mat file for a (game, window) combination.

    Raises ValueError when the game or window is not one of the known
    configurations in the module-level ``games`` / ``windows`` lists.
    (The original used ``assert``, which is silently stripped under
    ``python -O`` and would let bad arguments through.)
    """
    if game not in games:
        raise ValueError("unknown game: %r" % (game,))
    if window not in windows:
        raise ValueError("unknown window: %r" % (window,))
    return "../new_data/%s_all_window%d.mat" % (game, window)
def read_mat(game, window):
    """Load the .mat file for (game, window), keeping the selected columns.

    Columns 0 and 1 of the raw matrix are metadata (subject id and emotion
    label, judging by their use in load_data); the remaining columns are
    features.  Because of the 0-based indexing and those two leading
    columns, 1 is added to each feature number from ``window_features``.
    """
    filename = get_filename(game, window)
    # List comprehension instead of map(): map() is lazy on Python 3,
    # which would break the list concatenation below.
    features = [0, 1] + [x + 1 for x in window_features[window]]
    return pd.DataFrame(loadmat(filename)["new"])[features]
def split_by(df, column):
    """Yield (value, sub-frame) pairs, one per distinct value in *column*.

    Each sub-frame holds exactly the rows of *df* whose entry in *column*
    equals the yielded value, in their original row order.
    """
    key = df[column]
    for unique_value in key.unique():
        mask = key == unique_value
        yield unique_value, df[mask]
def preprocess(df):
    """Z-score each column (zero mean, unit std) and return a numpy array.

    A constant column yields 0/0 = NaN after scaling; fillna(0) turns such
    columns into all zeros.

    Fix: ``DataFrame.as_matrix()`` was removed from pandas; ``.values`` is
    the backward-compatible equivalent and works on every pandas version.
    """
    return ((df - df.mean()) / df.std()).fillna(0).values
def load_data():
    """Read every (game, window) .mat file into nested per-subject dicts.

    Returns
    -------
    (data, emotions) where
        data[game][window][subject_id]     -> preprocessed feature matrix
        emotions[game][window][subject_id] -> 1-d array of emotion labels

    Fixes over the original: the per-subject slice returned by split_by is
    no longer mutated in place (``del df[0], df[1]`` on a slice of the
    parent frame triggers pandas' SettingWithCopy hazard), and the unused
    ``subject`` variable is gone.
    """
    data, emotions = dict(), dict()
    for game in games:
        data[game], emotions[game] = dict(), dict()
        for window in windows:
            data[game][window], emotions[game][window] = dict(), dict()
            tmp = read_mat(game, window)
            for val, df in split_by(tmp, 0):
                emotion = df[1]
                # drop() returns a new frame instead of deleting columns
                # in place on a slice of tmp.
                features = df.drop([0, 1], axis=1)
                data[game][window][val] = preprocess(features)
                emotions[game][window][val] = np.array(emotion)
    return data, emotions
def balanced_error_rate(y_true, y_pred):
    """Return 1 minus the mean per-class recall (the balanced error rate).

    Both arguments are 1-d arrays of class labels of equal length.
    """
    predictions = np.asarray(y_pred)
    per_class_recall = []
    for label in np.unique(y_true):
        in_class = y_true == label
        hits = np.logical_and(in_class, predictions == label)
        per_class_recall.append(float(np.sum(hits)) / np.sum(in_class))
    return 1.0 - sum(per_class_recall) / len(per_class_recall)
def affective_crossvalidation(clf):
    """Run the full cross-game / cross-player / within-player evaluation.

    For every ordered pair of distinct games and every window size, trains
    *clf* once per subject and writes one CSV of balanced error rates:
    one row per training subject, one column per test subject of the same
    game, plus an "Other Game" column (same subject, other game; -1 when
    the subject is absent from the other game) and a within-subject
    k-fold CV column.

    Parameters
    ----------
    clf : classifier with scikit-learn style fit()/predict() methods.
    """
    # NOTE(review): dead `if 0:` branch — the cached pickle is always
    # loaded; flip the condition to regenerate the cache from the .mat files.
    if 0:
        # NOTE(review): pickle files should be opened in binary mode
        # ('wb'/'rb'); text mode only happens to work on Python 2 / Unix.
        with open("../new_data/data.pickle", 'w') as outfile:
            tmp = load_data()
            pickle.dump(tmp, outfile)
    else:
        with open("../new_data/data.pickle") as infile:
            tmp = pickle.load(infile)
    data, emotions = tmp
    # every ordered pair: train/test within `game`, cross-test on `game2`
    for game, game2 in permutations(games):
        for window in windows:
            vals = sorted(data[game][window].keys())  # subject ids
            # one row per training subject; +2 columns for the extra tests
            output = np.zeros((len(vals), len(vals) + 2))
            # NOTE(review): the label says "10-fold CV" but n_folds below
            # is 5 — one of the two is wrong; confirm the intended count.
            columns = vals + ["Other Game", "10-fold CV"]
            for idx_train, val_train in enumerate(vals):
                X_train = data[game][window][val_train]
                Y_train = emotions[game][window][val_train]
                # flag subjects with a single emotion class: stratified CV
                # below cannot split them meaningfully
                if len(np.unique(Y_train)) == 1:
                    print val_train
                clf.fit(X_train, Y_train)
                # testing player vs other player same game
                for idx_test, val_test in enumerate(vals):
                    X_test = data[game][window][val_test]
                    Y_test = emotions[game][window][val_test]
                    Y_pred = clf.predict(X_test)
                    err = balanced_error_rate(Y_test, Y_pred)
                    output[idx_train, idx_test] = err
                # testing player vs self on other game
                if val_train in data[game2][window]:
                    X_test = data[game2][window][val_train]
                    Y_test = emotions[game2][window][val_train]
                    Y_pred = clf.predict(X_test)
                    err = balanced_error_rate(Y_test, Y_pred)
                else:
                    # subject did not play the other game
                    err = -1
                output[idx_train, -2] = err
                # testing player vs self same game
                n_folds = 5
                # NOTE(review): old sklearn.cross_validation API; in
                # sklearn >= 0.18 use model_selection.StratifiedKFold
                # (constructor no longer takes y; iterate skf.split(X, y)).
                skf = StratifiedKFold(Y_train, n_folds=n_folds)
                err = []
                for train_idx, test_idx in skf:
                    X_train2, X_test = X_train[train_idx], X_train[test_idx]
                    Y_train2, Y_test = Y_train[train_idx], Y_train[test_idx]
                    clf.fit(X_train2, Y_train2)
                    Y_pred = clf.predict(X_test)
                    err.append(balanced_error_rate(Y_test, Y_pred))
                err = sum(err) / len(err)
                output[idx_train, -1] = err
            filename = "../new_data/error_%s_%d.csv" % (game, window)
            pd.DataFrame(output, index=vals, columns=columns).to_csv(filename)
- if __name__ == "__main__":
- # from sklearn.svm import SVC
- # svm = SVC()
- from sklearn.ensemble import RandomForestClassifier
- rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
- affective_crossvalidation(rf)