import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Draw a heatmap of the pairwise correlations between the columns of PIMA.
plt.figure(figsize=(12, 10))
corr_matrix = PIMA.corr()
sns.heatmap(corr_matrix, cmap='YlGnBu')
plt.show()
# Per-column count of entries equal to 0, before imputation.
PIMA.eq(0).astype(int).sum()

# Columns whose 0 entries are overwritten with the column mean.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
zero_subset = PIMA[zero_features]
# NOTE(review): these means are taken over the full columns, so the zeros being
# replaced are themselves included in the average, and the statistics are
# computed before any train/test split.
column_means = zero_subset.mean()
PIMA[zero_features] = zero_subset.replace(0, column_means)

# Same count again, after imputation.
PIMA.eq(0).astype(int).sum()
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, recall_score, precision_score, roc_auc_score, f1_score)
# Separate the target column from the predictor columns.
y = PIMA['Outcome']
X = PIMA.drop(columns=['Outcome'])

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=13,
)

# Standardize the features, then fit a liblinear logistic regression.
estimators = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(solver='liblinear', random_state=13)),
]
pipe = Pipeline(steps=estimators)
pipe.fit(X_train, y_train)

# Hard class-label predictions for the held-out rows.
pred = pipe.predict(X_test)
# Evaluate the fitted pipeline on the held-out split.
# Accuracy, recall, precision and F1 compare hard class labels, whereas ROC AUC
# ranks samples by a continuous score, so it is given the predicted probability
# of the positive class (column 1 of predict_proba, i.e. pipe.classes_[1]).
pred_proba = pipe.predict_proba(X_test)[:, 1]
print('Accuracy: ', accuracy_score(y_test, pred))
print('Recall: ', recall_score(y_test, pred))
print('Precision: ', precision_score(y_test, pred))
print('AUC score: ', roc_auc_score(y_test, pred_proba))
print('f1 score: ', f1_score(y_test, pred))
# Output recorded from an earlier run. NOTE: its "AUC score" line was produced
# by passing the hard labels in `pred` to roc_auc_score, so the
# probability-based value printed above is expected to differ from it.
'''
Accuracy: 0.7727272727272727
Recall: 0.6111111111111112
Precision: 0.7021276595744681
AUC score: 0.7355555555555556
f1 score: 0.6534653465346535
'''
# Coefficient of each feature in the fitted multivariable (logistic) equation.
coeff = list(pipe.named_steps['clf'].coef_[0])
labels = X_train.columns.tolist()

# One row per feature, ordered from the smallest coefficient to the largest.
features = pd.DataFrame({'Features': labels, 'importance': coeff}).sort_values(
    by='importance'
)
features['positive'] = features['importance'].gt(0)
features = features.set_index('Features')

# Horizontal bars: blue for coefficients above zero, red otherwise.
bar_colors = features['positive'].map({True: 'blue', False: 'red'})
features['importance'].plot(kind='barh', figsize=(11, 6), color=bar_colors)
plt.xlabel('Importance')
plt.show()
# Reference
# 1) 제로베이스 데이터스쿨 강의자료 (Zero-Base Data School lecture materials)