/AI/test_sklearn.py
Python | 109 lines | 82 code | 18 blank | 9 comment | 12 complexity | d891933cc0ce8c2fee82f2dd1fc0062d MD5 | raw file
- import pandas as pd
- import numpy as np
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.naive_bayes import GaussianNB
- from sklearn.metrics import confusion_matrix
- from sklearn.metrics import accuracy_score, precision_score, recall_score
# Load the raw records once and keep that frame untouched.
df_origin = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Work on an independent copy: a plain `df = df_origin` would bind both names
# to the same object, so the in-place column rewrites that follow would also
# overwrite the "original" data.
df = df_origin.copy()
def reset_creatinine_phosphokinase(x):
    """Map a raw creatinine_phosphokinase reading to a coarse integer bin.

    Readings up to and including 1000 are divided by 100; larger readings
    are divided by 1000 and offset by 11.  Both branches go through the
    built-in ``round()``, which rounds exact halves to the nearest even
    integer (50 -> 0, 150 -> 2, 250 -> 2).

    NOTE(review): the two branches do not meet -- 1000 maps to 10 while
    1001 maps to 12, so bin 11 is never produced.  The arithmetic is kept
    exactly as written; confirm whether the offset was meant to be 10.

    Args:
        x: Numeric reading.

    Returns:
        The integer bin for ``x``.
    """
    if x > 1000:
        return round(11 + x / 1000)
    return round(x / 100)
# Collapse creatinine_phosphokinase into coarse integer bins.  The helper is
# passed directly; wrapping it in a lambda adds nothing.
df['creatinine_phosphokinase'] = (
    df['creatinine_phosphokinase']
    .apply(reset_creatinine_phosphokinase)
    .astype(int)
)

# Scale down and truncate (astype(int) drops the fractional part).
df['time'] = (df['time'] / 100).astype(int)
df['ejection_fraction'] = (df['ejection_fraction'] / 20).astype(int)

# 20/40/60/80 % cut points of platelets, as a plain list of floats indexed
# 0..3.  Requesting all four at once avoids writing float values one by one
# into an integer-typed Series.
quantile = df['platelets'].quantile([0.2, 0.4, 0.6, 0.8]).tolist()
def reset_by_quantile(x, quant):
    """Return the index of the first threshold in *quant* that *x* is below.

    With four ascending thresholds this yields 0..4: 0 when ``x`` is below
    the first threshold, 4 when it is below none of them.  A value equal to
    a threshold falls into the bin above it (the comparison is strict).
    Any number of thresholds is accepted; the result ranges from 0 to
    ``len(quant)``.

    Args:
        x: Value to classify.
        quant: Thresholds, iterated in order (list or Series of values).

    Returns:
        The integer bin index for ``x``.
    """
    for bin_index, threshold in enumerate(quant):
        if x < threshold:
            return bin_index
    # Not below any threshold (this also covers NaN, which compares False).
    return len(quant)
# Replace each platelets value by its bin index, using the thresholds
# currently held in `quantile`.
df['platelets'] = (
    df['platelets']
    .apply(lambda value: reset_by_quantile(value, quantile))
    .astype(int)
)

# 20/40/60/80 % cut points of serum_creatinine, as a plain list of floats
# indexed 0..3.  Requesting all four at once avoids writing float values one
# by one into an integer-typed Series.
quantile = df['serum_creatinine'].quantile([0.2, 0.4, 0.6, 0.8]).tolist()
# NOTE(review): this repeats the definition of reset_by_quantile that already
# appears earlier in this file; one of the two can be removed.
def reset_by_quantile(x, quant):
    """Return the index of the first threshold in *quant* that *x* is below.

    With four ascending thresholds this yields 0..4: 0 when ``x`` is below
    the first threshold, 4 when it is below none of them.  A value equal to
    a threshold falls into the bin above it (the comparison is strict).
    Any number of thresholds is accepted; the result ranges from 0 to
    ``len(quant)``.

    Args:
        x: Value to classify.
        quant: Thresholds, iterated in order (list or Series of values).

    Returns:
        The integer bin index for ``x``.
    """
    for bin_index, threshold in enumerate(quant):
        if x < threshold:
            return bin_index
    # Not below any threshold (this also covers NaN, which compares False).
    return len(quant)
# Replace each serum_creatinine value by its bin index, using the thresholds
# currently held in `quantile`.
creatinine_bins = df['serum_creatinine'].apply(
    lambda value: reset_by_quantile(value, quantile)
)
df['serum_creatinine'] = creatinine_bins.astype(int)

# Keep these eleven columns, in this order, with DEATH_EVENT last.
selected_columns = [
    'anaemia',
    'creatinine_phosphokinase',
    'serum_creatinine',
    'ejection_fraction',
    'diabetes',
    'platelets',
    'high_blood_pressure',
    'sex',
    'smoking',
    'time',
    'DEATH_EVENT',
]
df = df.loc[:, selected_columns]
- #realdata = df.loc[0:20,['age','anaemia','diabetes','high_blood_pressure','sex','smoking','DEATH_EVENT']]
- #realdata['age']=round(realdata['age']/20)
- #realdata['age']=realdata['age'].astype(int);
- #realdata = df.loc[0:20,['diabetes','sex','smoking','DEATH_EVENT']]
# Random 80 % of the rows for training.  No random_state is given, so the
# split differs from run to run.
df_train = df.sample(frac=0.8)

# Everything that was not sampled becomes the test set.
rowlist = list(df_train.index)
df_test = df.drop(index=rowlist)
# Show three sample rows of the prepared frame (index labels 0, 1 and 150).
for row_label in (0, 1, 150):
    print(df.loc[row_label, :])
# print(df_train)
# print(df_test)
# Separate features from the DEATH_EVENT target by column label rather than
# by fixed position, so the split does not depend on the column count.
X = df_train.drop(columns='DEATH_EVENT')
Y = df_train['DEATH_EVENT']
X_t = df_test.drop(columns='DEATH_EVENT')
Y_t = df_test['DEATH_EVENT']
# print(X)
# print(Y)
# model = DecisionTreeClassifier()
model = GaussianNB()
model.fit(X, Y)

# Predict on the held-out rows and show predictions next to the true labels.
pred = model.predict(X_t)
print(pred)
print(Y_t)

# Store the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with its return value.
conf_matrix = confusion_matrix(Y_t, pred)
print(conf_matrix)

accuracy = accuracy_score(Y_t, pred)
precision = precision_score(Y_t, pred)
recall = recall_score(Y_t, pred)
print(accuracy)
print(precision)
print(recall)