train_test_split()
: 머신러닝에서 전체 데이터를 train set(학습용)과 test set(평가용)으로 분할한다.
random_state
: 분할에 사용하는 난수 시드. 같은 값을 지정하면 실행할 때마다 동일하게 분할되어 결과를 재현할 수 있다.
출처: https://bigdaheta.tistory.com/54
import pandas as pd

# Location of the wine CSV used throughout the first half of this script.
wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'
# index_col=0: use the first CSV column as the DataFrame index.
wine = pd.read_csv(filepath_or_buffer=wine_url, index_col=0)
# Preview the first five rows (notebook display).
wine.head(n=5)

# Binary target: 1.0 where quality is above 5, otherwise 0.0.
# Converted to a plain array so the column is assigned positionally.
wine['taste'] = (wine['quality'] > 5).astype(float).to_numpy()
# 'taste' is the label and is derived from 'quality', so neither may remain
# among the features.
y = wine['taste']
X = wine.drop(columns=['taste', 'quality'])
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# Baseline: logistic regression fitted directly on the unscaled features.
lr = LogisticRegression(solver='liblinear', random_state=13).fit(X_train, y_train)

# Predict on both splits, then report accuracy for each.
y_pred_tr = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
train_acc = accuracy_score(y_train, y_pred_tr)
print('Train Acc : ', train_acc)
test_acc = accuracy_score(y_test, y_pred_test)
print('Test Acc : ', test_acc)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Each pipeline step is a (name, estimator) pair; the steps run in list order,
# so features are standardised before they reach the classifier.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('clf', LogisticRegression(solver='liblinear', random_state=13))
estimators = [scaling_step, classifier_step]
pipe = Pipeline(steps=estimators)
'scaler' -> 스케일러 단계 (StandardScaler)
'clf' -> 분류기 단계 (LogisticRegression)
# Fit the scaler + classifier pipeline on the training split.
pipe.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_pred_tr = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

# Report the two accuracies in the same order and format: train first, then test.
for caption, truth, guess in (
    ('Train Acc : ', y_train, y_pred_tr),
    ('Test Acc : ', y_test, y_pred_test),
):
    print(caption, accuracy_score(truth, guess))

from sklearn.tree import DecisionTreeClassifier

# Depth-2 decision tree fitted on the same training split, for comparison.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13).fit(X_train, y_train)

# Display name -> fitted model; insertion order is the plotting order.
models = {}
models['logistic regression'] = pipe
models['decision tree'] = wine_tree
# matplotlib is imported here because this plot runs before the file's later
# `import matplotlib.pyplot as plt`; without it `plt` is undefined at this point.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Draw one ROC curve per fitted model on a shared figure.
plt.figure(figsize=(10, 8))
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1])
for model_name, model in models.items():
    # Column 1 of predict_proba is the probability of the positive class.
    pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    plt.plot(fpr, tpr, label=model_name)
plt.grid()
plt.legend()
plt.show()

# Location of the diabetes CSV used for the second half of this script.
PIMA_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/diabetes.csv'
PIMA = pd.read_csv(filepath_or_buffer=PIMA_url)
# Preview the first five rows (notebook display).
PIMA.head(n=5)

# Column dtypes and non-null counts as loaded.
PIMA.info()

# Cast every column to float, then print the summary again to show the new dtypes.
PIMA = PIMA.astype(float)
PIMA.info()

import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the pairwise column correlations of PIMA.
pima_corr = PIMA.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(data=pima_corr, cmap="YlGnBu")
plt.show()

# Count, per column, how many entries are exactly zero.
PIMA.eq(0).astype(int).sum()

cf) EDA의 목적 => 데이터에서 이상한 값(예: 0으로 기록된 측정값)을 찾아내기 위함.
# Columns whose zero entries are replaced by the column mean.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
zero_cols = PIMA[zero_features]
# Per-column means, computed before the replacement (so zeros are included).
col_means = zero_cols.mean()
PIMA[zero_features] = zero_cols.replace(0, col_means)
# Recount the zeros per column after the replacement.
PIMA.eq(0).astype(int).sum()
# 'Outcome' is the label; every other column is a feature.
y = PIMA['Outcome']
X = PIMA.drop(columns=['Outcome'])

# Stratified 80/20 split: both splits keep the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13, stratify=y
)

# Standardise the features, then classify with logistic regression.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('clf', LogisticRegression(solver='liblinear', random_state=13))
estimators = [scaling_step, classifier_step]
pipe_lr = Pipeline(steps=estimators).fit(X_train, y_train)

# Hard class predictions for the held-out split.
pred = pipe_lr.predict(X_test)
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

# Report the classification metrics of the hard predictions on the test split.
print('Accuracy : ', accuracy_score(y_test, pred))
print('Recall : ', recall_score(y_test, pred))
print('Precision : ', precision_score(y_test, pred))
# NOTE(review): `pred` holds class labels, not probabilities, so this AUC is
# computed from a single operating point. Passing predict_proba(...)[:, 1]
# would give the threshold-independent AUC — confirm which one is intended.
print('AUC score : ', roc_auc_score(y_test, pred))
print('f1 score : ', f1_score(y_test, pred))

# Coefficients of the fitted logistic-regression step, one per input column.
coeff = list(pipe_lr['clf'].coef_[0])
labels = list(X_train.columns)
coeff

# Pair every column name with its coefficient.
features = pd.DataFrame({'Feature': labels, 'importance': coeff})
features

# Sort ascending so the most negative coefficient comes first.
features.sort_values(by=['importance'], ascending=True, inplace=True)
features

# Flag the sign of each coefficient; it selects the bar colour below.
features['positive'] = features['importance'] > 0
features

# Index by column name so the bars are labelled with the feature names.
features.set_index('Feature', inplace=True)
features

# Horizontal bar chart: blue for positive coefficients, red for the rest.
features['importance'].plot(
    kind='barh',
    figsize=(11, 6),
    color=features['positive'].map({True: 'blue', False: 'red'}),
)
plt.xlabel('Importance')
plt.show()
