/src/pathology.py
Python | 242 lines | 234 code | 5 blank | 3 comment | 0 complexity | 9c17d30caa792409da52432dc0d3cfab MD5 | raw file
- #!/usr/bin/env python
- from sklearn.svm import SVC
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.cross_validation import KFold
- from sklearn.cross_validation import StratifiedKFold
- from sklearn.metrics import roc_curve, auc, precision_score, classification_report
- import pandas as pd
- import numpy as np
- from scipy import interp
- import os
- import pylab as pl
- import config
- import util
- def randomforest(data, targets, tree_num=100):
- model = RandomForestClassifier(n_estimators=tree_num,
- n_jobs=4,
- max_features=data.shape[1]/2+1,
- verbose=0,
- oob_score=True,
- compute_importances=True,
- random_state=12345678)
- model.fit(data, targets)
- return model
- def get_control_data():
- # invivo single control
- folder = config.rat_invivo_single_folder
- '''
- fpath = os.path.join(folder, "expr_Control_3hr.csv")
- df_3hr = pd.read_csv(fpath, header=0, index_col=0)
- fpath = os.path.join(folder, "expr_Control_6hr.csv")
- df_6hr = pd.read_csv(fpath, header=0, index_col=0)
- fpath = os.path.join(folder, "expr_Control_9hr.csv")
- df_9hr = pd.read_csv(fpath, header=0, index_col=0)
- '''
- fpath = os.path.join(folder, "expr_Control_24hr.csv")
- df_24hr = pd.read_csv(fpath, header=0, index_col=0)
- #single_control_df = pd.concat([df_3hr, df_6hr, df_9hr, df_24hr])
- single_control_df = pd.concat([df_24hr])
- '''
- # invivo repeat control
- folder = config.rat_invivo_repeat_folder
-
- fpath = os.path.join(folder, "expr_Control_4day.csv")
- df_4day = pd.read_csv(fpath, header=0, index_col=0)
- fpath = os.path.join(folder, "expr_Control_8day.csv")
- df_8day = pd.read_csv(fpath, header=0, index_col=0)
- fpath = os.path.join(folder, "expr_Control_15day.csv")
- df_15day = pd.read_csv(fpath, header=0, index_col=0)
- fpath = os.path.join(folder, "expr_Control_29day.csv")
- df_29day = pd.read_csv(fpath, header=0, index_col=0)
- repeat_control_df = pd.concat([df_4day, df_8day, df_15day, df_29day])
- '''
- #control_df = pd.concat([single_control_df, repeat_control_df])
- control_df = pd.concat([single_control_df])
- del control_df['DILI_Class']
- return control_df
- def invivo():
- control_df = get_control_data()
- print "Control:", control_df.shape
- # single collapsed expression
- single_expr_df = pd.read_csv(config.rat_invivo_single_expression, header=None, index_col=False)
- sample_names = util.read_sample_names(config.rat_invivo_single_sample)
- gene_names = util.read_gene_names(config.rat_invivo_single_gene)
- single_expr_df.columns = gene_names
- single_expr_df.index = sample_names
- # repeat collapsed expression
- repeat_expr_df = pd.read_csv(config.rat_invivo_repeat_expression, header=None, index_col=False)
- sample_names = util.read_sample_names(config.rat_invivo_repeat_sample)
- gene_names = util.read_gene_names(config.rat_invivo_repeat_gene)
- repeat_expr_df.columns = gene_names
- repeat_expr_df.index = sample_names
- findings = ["Necrosis", "Hypertrophy", "Microgranuloma", "Cellular infiltration", "Change"]
- for finding in findings:
- pathology_df = pd.read_csv("../data/pathology/%s.csv" % finding, header=0, index_col=False)
- expr_df = pd.DataFrame(columns=single_expr_df.columns)
- for ind, row in single_expr_df.iterrows():
- snames = ind.split("_")
- snames = [int(sname.lstrip("00")) for sname in snames]
- for sname in snames:
- if sname in list(pathology_df['CEL']):
- expr_df = expr_df.append(row, verify_integrity=True)
- break
- for ind, row in repeat_expr_df.iterrows():
- snames = ind.split("_")
- snames = [int(sname.lstrip("00")) for sname in snames]
- for sname in snames:
- if sname in list(pathology_df['CEL']):
- expr_df = expr_df.append(row, verify_integrity=True)
- break
-
- targets = []
- targets.extend([0]*control_df.shape[0])
- targets.extend([1]*expr_df.shape[0])
- targets = np.array(targets)
- data_df = pd.concat([control_df, expr_df])
- print data_df.shape
- print targets.shape
- data = np.array(data_df)
- kf = KFold(data.shape[0], k=5, shuffle=True)
- mean_tpr = 0.0
- mean_fpr = np.linspace(0, 1, 100)
- all_tpr = []
- count = 1
- for train_index, test_index in kf:
- model = randomforest(data[train_index], targets[train_index], tree_num=100)
- probas_ = model.predict_proba(data[test_index])
- fpr, tpr, thresholds = roc_curve(targets[test_index], probas_[:, 1])
- mean_tpr += interp(mean_fpr, fpr, tpr)
- mean_tpr[0] = 0.0
- roc_auc = auc(fpr, tpr)
- count += 1
- mean_tpr /= len(kf)
- mean_tpr[-1] = 1.0
- mean_auc = auc(mean_fpr, mean_tpr)
- pl.plot(mean_fpr, mean_tpr, label='Mean AUC = %0.2f (%s)' % (mean_auc, finding), lw=2)
-
- pl.xlim([-0.05, 1.05])
- pl.ylim([-0.05, 1.05])
- pl.xlabel('False Positive Rate')
- pl.ylabel('True Positive Rate')
- pl.title('Mean ROC Curve')
- pl.legend(loc="lower right")
- pl.savefig('../plots/all_pathology.pdf')
- pl.show()
-
- def invitro():
- expr_df = pd.read_csv(config.rat_invitro_expression, header=None, index_col=False)
- sample_names = util.read_sample_names(config.rat_invitro_sample)
- gene_names = util.read_gene_names(config.rat_invitro_gene)
- expr_df.columns = gene_names
- expr_df.index = sample_names
- findings = ["Necrosis", "Hypertrophy", "Microgranuloma", "Cellular infiltration", "Change"]
-
- new_expr_df = pd.DataFrame(columns=expr_df.columns)
- for finding in findings:
- barcodes = []
- with open("../data/invitro_pathology/%s" % finding) as fh:
- for line in fh:
- barcodes.append(int(line))
- for ind, row in expr_df.iterrows():
- snames = ind.split("_")
- snames = [int(sname.lstrip("00")) for sname in snames]
- for sname in snames:
- if sname in barcodes:
- #new_expr_df = new_expr_df.append(row, verify_integrity=True)
- new_expr_df = new_expr_df.append(row)
- break
-
- for finding in findings:
- pos_data_df = pd.DataFrame(columns=new_expr_df.columns)
- neg_data_df = pd.DataFrame(columns=new_expr_df.columns)
-
- barcodes = []
- with open("../data/invitro_pathology/%s" % finding) as fh:
- for line in fh:
- barcodes.append(int(line))
- for ind, row in new_expr_df.iterrows():
- snames = ind.split("_")
- snames = [int(sname.lstrip("00")) for sname in snames]
- flag = True
- for sname in snames:
- if sname in barcodes:
- pos_data_df = pos_data_df.append(row)
- flag = False
- break
- if flag:
- neg_data_df = neg_data_df.append(row)
- targets = []
- targets.extend([1]*pos_data_df.shape[0])
- targets.extend([0]*neg_data_df.shape[0])
- targets = np.array(targets)
- data_df = pd.concat([pos_data_df, neg_data_df])
- data = np.array(data_df)
- print data.shape
- print targets.shape
- kf = KFold(data.shape[0], k=5, shuffle=True)
- mean_tpr = 0.0
- mean_fpr = np.linspace(0, 1, 100)
- all_tpr = []
- count = 1
- for train_index, test_index in kf:
- model = randomforest(data[train_index], targets[train_index], tree_num=100)
- probas_ = model.predict_proba(data[test_index])
- fpr, tpr, thresholds = roc_curve(targets[test_index], probas_[:, 1])
- mean_tpr += interp(mean_fpr, fpr, tpr)
- mean_tpr[0] = 0.0
- roc_auc = auc(fpr, tpr)
- count += 1
- mean_tpr /= len(kf)
- mean_tpr[-1] = 1.0
- mean_auc = auc(mean_fpr, mean_tpr)
- pl.plot(mean_fpr, mean_tpr, label='Mean AUC = %0.2f (%s)' % (mean_auc, finding), lw=2)
-
- pl.xlim([-0.05, 1.05])
- pl.ylim([-0.05, 1.05])
- pl.xlabel('False Positive Rate')
- pl.ylabel('True Positive Rate')
- pl.title('Mean ROC Curve')
- pl.legend(loc="lower right")
- pl.savefig('../plots/invitro_pathology.pdf')
- pl.show()
- def main():
- #invivo()
- invitro()
- if __name__ == "__main__":
- main()