Cross Validation
: Instead of splitting the data into train/validation sets just once during training, split it K times and compare the performance of each trained model.
hold out
: Splitting the data once into a train_set and a test_set.
k-fold cross validation
: Splitting the train_set into k folds, using each fold in turn as the new test_set and the rest as the train_set, then using the average score.
stratified k-fold cross validation
: A k-fold variant that preserves the label distribution in each fold when the classes are imbalanced.
# simple test
import numpy as np
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
# to randomize the fold assignment:
# kf = KFold(n_splits=2, random_state=val, shuffle=True)
kf = KFold(n_splits=2)
print(kf.get_n_splits(X))
print(kf)
for train_idx, test_idx in kf.split(X):
    print('----- idx')
    print(train_idx, test_idx)
    print('----- train data')
    print(X[train_idx])
    print('----- val data')
    print(X[test_idx])
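The commented-out line above hints at shuffling. A minimal sketch of the same split with shuffle=True (the random_state=13 here is an arbitrary choice, not from the original): without shuffling, KFold assigns folds purely by row order.
kf_shuffled = KFold(n_splits=2, shuffle=True, random_state=13)
for train_idx, test_idx in kf_shuffled.split(X):
    # fold membership is now randomized but reproducible
    print(train_idx, test_idx)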
import pandas as pd
red_wine = pd.read_csv('winequality-red.csv', sep=';')
white_wine = pd.read_csv('winequality-white.csv', sep=';')
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])
wine.reset_index(drop=True, inplace=True)
wine.head(2)
# tidy the data: binarize quality into a taste label
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
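Before modeling, it is worth checking how balanced the taste label actually is; a small sketch (not in the original) using pandas:
# share of each class; if this is skewed, stratified k-fold is the safer choice
print(y.value_counts(normalize=True))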
# classification with a DecisionTree
# plain hold-out split
# is this the best result? a single split cannot tell us for sure
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : {}'.format(accuracy_score(y_train, y_pred_tr)))
print('Test Acc : {}'.format(accuracy_score(y_test, y_pred_test)))
plt.figure(figsize=(12, 7))
plot_tree(wine_tree, feature_names = X_train.columns)
plt.show()
# classification with a DecisionTree
# cross-validated data
# KFold
# 5-fold is the most common choice
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
kFold = KFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
# train on each fold and record the accuracy
cv_accuracy = []
for train_idx, test_idx in kFold.split(X):
    # split into train / test for this fold
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    # fit on this fold's train split
    wine_tree_cv.fit(X_train, y_train)
    # evaluate accuracy on this fold's test split
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy
# if the variance of the accuracies is small, use the mean as the representative value
np.mean(cv_accuracy), np.var(cv_accuracy), np.std(cv_accuracy)
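Plain KFold splits by row order, so the label ratio can drift between folds. A minimal sketch (not in the original) to inspect the positive-class share of each test fold before switching to StratifiedKFold:
# uneven shares across folds are exactly what StratifiedKFold corrects
for train_idx, test_idx in KFold(n_splits=5).split(X):
    print(y.iloc[test_idx].mean())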
# classification with a DecisionTree
# StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
skFold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
# train on each fold and record the accuracy
cv_accuracy = []
for train_idx, test_idx in skFold.split(X, y):
    # split into train / test for this fold
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    # fit on this fold's train split
    wine_tree_cv.fit(X_train, y_train)
    # evaluate accuracy on this fold's test split
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy
# if the variance of the accuracies is small, use the mean as the representative value
np.mean(cv_accuracy), np.var(cv_accuracy), np.std(cv_accuracy)
cross_val_score
: The manual loop above can be replaced with a single call to cross_val_score.
from sklearn.model_selection import cross_val_score
skFold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skFold)
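cross_val_score returns one score per fold. As a short follow-up sketch (not in the original), the mean and standard deviation give the same summary computed manually above:
scores = cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skFold)
print(scores.mean(), scores.std())  # representative score and its spread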
def skfold_dt(depth):
    from sklearn.model_selection import cross_val_score
    skFold = StratifiedKFold(n_splits=5)
    wine_tree_cv = DecisionTreeClassifier(max_depth=depth, random_state=13)
    print(cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skFold))
skfold_dt(3)
cross_validate
: Use cross_validate when you also want fit/score times and, optionally, the train scores.
from sklearn.model_selection import cross_validate
skFold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cross_validate(wine_tree_cv, X, y, scoring=None, cv=skFold, return_train_score=True)
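The dict that cross_validate returns reads more easily as a DataFrame. A minimal sketch (not in the original) that also exposes the train/test gap as an overfitting check:
result = cross_validate(wine_tree_cv, X, y, cv=skFold, return_train_score=True)
df = pd.DataFrame(result)
df['gap'] = df['train_score'] - df['test_score']  # large gaps hint at overfitting
print(df)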
Hyperparameter Tuning
: Adjusting the key settings of a model to secure its performance.
Hyperparameter
: A value that has to be adjusted by hand (manually), rather than learned from the data.
import pandas as pd
red_wine = pd.read_csv('winequality-red.csv', sep=';')
white_wine = pd.read_csv('winequality-white.csv', sep=';')
red_wine['color'] = 1
white_wine['color'] = 0
wine = pd.concat([red_wine, white_wine])
wine.reset_index(drop=True, inplace=True)
# tidy the data: binarize quality into a taste label
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
GridSearchCV
: Optimizes several hyperparameters over a grid; a wrapper-style class around the model.
validation_curve
: Optimizes a single hyperparameter (see the sketch after the GridSearchCV example below).
ParameterGrid
: A grid for optimizing several parameters (also sketched below).
# GridSearchCV
# define the parameters whose results we want to inspect
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
params = {'max_depth' : [2, 4, 7, 10]}
wine_tree = DecisionTreeClassifier(max_depth=2, random_state = 13)
# specify the classifier, the parameters to evaluate, and the cross validation
gridSearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)
# gridSearch handles the train/test splitting itself
# n_jobs: uses more CPU cores in parallel, so raising it speeds up the search
gridSearch.fit(X, y)
import pprint
pp = pprint.PrettyPrinter(indent = 4)
pp.pprint(gridSearch.cv_results_)
# the model with the best performance
print(gridSearch.best_estimator_)
# the best score achieved
print(gridSearch.best_score_)
# the parameters that produced it
print(gridSearch.best_params_)
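validation_curve and ParameterGrid were named above but not demonstrated. A minimal sketch on the same wine X, y (the depth range mirrors the grid above; this block is not from the original):
import numpy as np
from sklearn.model_selection import validation_curve, ParameterGrid
# validation_curve: sweep a single hyperparameter and get per-fold scores back
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(random_state=13), X, y,
    param_name='max_depth', param_range=[2, 4, 7, 10], cv=5)
print(np.mean(test_scores, axis=1))  # mean CV accuracy per depth
# ParameterGrid: expand a parameter dict into every combination
print(list(ParameterGrid({'max_depth': [2, 4], 'min_samples_leaf': [1, 5]})))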
To apply GridSearch to a model wrapped in a pipeline:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
# set up the Pipeline
estimators = [
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier(random_state=13))
]
pipe = Pipeline(estimators)
# set up GridSearchCV
# use '__' to address a step's options inside the pipeline
params = [{'clf__max_depth': [2, 4, 7, 10]}]
GridSearch = GridSearchCV(estimator = pipe, param_grid = params, cv = 5)
GridSearch.fit(X, y)
# best model
print(GridSearch.best_estimator_)
print(GridSearch.best_score_)
print(GridSearch.best_params_)
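The '__' convention extends to any pipeline step, so several steps can be searched at once. A hedged sketch (the scaler parameter here is illustrative, not from the original):
# search tree depth and whether the scaler divides by the std at the same time
params_multi = [{'clf__max_depth': [2, 4, 7, 10],
                 'scaler__with_std': [True, False]}]
grid_multi = GridSearchCV(estimator=pipe, param_grid=params_multi, cv=5)
grid_multi.fit(X, y)
print(grid_multi.best_params_)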
# draw the tree of the best model
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 7))
plot_tree(GridSearch.best_estimator_['clf'], feature_names=X.columns,
          class_names=['W', 'R'],
          filled=True)
plt.show()
Summarizing the performance results in a table
import pandas as pd
score_df = pd.DataFrame(GridSearch.cv_results_)
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
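As a short follow-up sketch (not in the original), sorting by rank puts the best parameter set on top:
# best-ranked parameter set first
print(score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
      .sort_values('rank_test_score'))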