Cross Validation: instead of splitting the data into train/validation sets just once during training, split it K different ways and compare the performance of the model trained on each split.
# simple test
import numpy as np
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
# To shuffle the folds randomly (val here is a placeholder for a seed value):
# kf = KFold(n_splits=2, random_state=val, shuffle=True)
kf = KFold(n_splits=2)
print(kf.get_n_splits(X))
print(kf)
for train_idx, test_idx in kf.split(X):
    print('----- idx')
    print(train_idx, test_idx)
    print('----- train data')
    print(X[train_idx])
    print('----- val data')
    print(X[test_idx])
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
red_wine['color'] = 1.    # label red wine as 1
white_wine['color'] = 0.  # label white wine as 0
wine = pd.concat([red_wine, white_wine])
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]  # quality above 5 -> good taste (1)
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
Can we trust this accuracy? Is the single split above really the best way to divide the data?
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
for train_idx, test_idx in kfold.split(X):
    print(len(train_idx), len(test_idx))
cv_accuracy = []
for train_idx, test_idx in kfold.split(X):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    wine_tree_cv.fit(X_train, y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy
np.mean(cv_accuracy)
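Before moving on to StratifiedKFold: plain KFold cuts the rows in order, and since the red and white wines were concatenated in order, the label ratio can differ from fold to fold. A quick check, assuming the y Series and kfold object from above:
# overall ratio of the taste labels
print(y.value_counts(normalize=True))
# label ratio inside each KFold validation fold
for train_idx, test_idx in kfold.split(X):
    print(y.iloc[test_idx].value_counts(normalize=True).to_dict())
StratifiedKFold, used next, keeps that overall label ratio inside every fold.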
from sklearn.model_selection import StratifiedKFold
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
cv_accuracy = []
for train_idx, test_idx in skfold.split(X, y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    wine_tree_cv.fit(X_train, y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy
np.mean(cv_accuracy)
from sklearn.model_selection import cross_val_score
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)
cross_val_score(wine_tree_cv, X, y, cv=skfold)
def skfold_df(depth):
    from sklearn.model_selection import cross_val_score
    skfold = StratifiedKFold(n_splits=5)
    wine_tree_cv = DecisionTreeClassifier(max_depth=depth, random_state=13)
    print(cross_val_score(wine_tree_cv, X, y, cv=skfold))
skfold_df(3)
from sklearn.model_selection import cross_validate
cross_validate(wine_tree_cv, X, y, cv=skfold, return_train_score=True)
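cross_validate returns a dict of arrays (fit_time, score_time, test_score, and train_score when return_train_score=True); wrapping it in a DataFrame makes the per-fold comparison easier. A small sketch, reusing the pandas import from above:
scores = cross_validate(wine_tree_cv, X, y, cv=skfold, return_train_score=True)
pd.DataFrame(scores)  # one row per fold; a large train/test score gap suggests overfitting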
Hyperparameter tuning: adjusting the configuration values used to secure the model's performance.
Feature Engineering: observing the features and modifying them, or discovering new ones, so that a machine learning model can learn better.
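As an illustration only: a derived feature can be added as a new column. so2_ratio below is a hypothetical feature name (the share of free SO2 in total SO2); whether it actually helps would have to be checked against validation results.
# hypothetical engineered feature from two existing wine columns
wine['so2_ratio'] = wine['free sulfur dioxide'] / wine['total sulfur dioxide']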
- Split the data into Training, Validation, and Test sets
- Build a model with the train data -> train it -> check performance on the validation data
- Based on the results, adjust the hyperparameters: values that must be tuned by hand (manually); a minimal sketch of this loop follows the list below
Hyperparameters (these are the tuning targets)
- Learning rate
- Learning rate scheduling method
- Activation function
- Loss function
- Number of training iterations
- Weight initialization method
- Regularization method
- Number of layers to stack
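A minimal sketch of the manual tuning loop described above: hold out a validation set and compare a few candidate values of max_depth (a decision tree hyperparameter) by hand. The candidate list [2, 4, 7, 10] and the variable names are illustrative only; X and y are the wine features/target defined earlier.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# split off a validation set once
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=13)
for depth in [2, 4, 7, 10]:
    model = DecisionTreeClassifier(max_depth=depth, random_state=13)
    model.fit(X_tr, y_tr)
    print('max_depth =', depth, '-> val acc =', accuracy_score(y_val, model.predict(X_val)))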
import pandas as pd
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'
red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')
red_wine['color'] = 1.
white_wine['color'] = 0.
wine = pd.concat([red_wine, white_wine])
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
GridSearchCV: optimizes several hyperparameters at once over a grid of candidate values; a wrapper-style class around a model.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
params = {'max_depth' : [2, 4, 7, 10]}
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)
gridsearch.fit(X,y)
Raising the n_jobs option here lets GridSearchCV use more CPU cores in parallel; on a machine with many cores, a higher n_jobs speeds up the search.
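For example (n_jobs=-1 asks scikit-learn to use all available cores):
gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5, n_jobs=-1)
gridsearch.fit(X, y)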
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)
gridsearch.best_estimator_
gridsearch.best_score_
gridsearch.best_params_
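The full cv_results_ dict is easier to scan as a DataFrame; the columns below follow scikit-learn's cv_results_ naming convention:
score_df = pd.DataFrame(gridsearch.cv_results_)
score_df[['param_max_depth', 'mean_test_score', 'std_test_score', 'rank_test_score']]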
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
estimators = [
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier())
]
pipe = Pipeline(estimators)
param_grid = [{'clf__max_depth':[2,4,7,10]}]
GridSearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
GridSearch.fit(X,y)
GridSearch.best_estimator_
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 7))
plot_tree(GridSearch.best_estimator_['clf'], feature_names=X.columns,
          class_names=['bad', 'good'],  # the target here is taste (0/1), not wine color
          filled=True)
plt.show()