# Hold-out method
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Hold-out evaluation: carve the full data set into train / validation / test.
# Step 1 - reserve 20% of all samples as the test set, stratified on y.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=0
)
# Step 2 - split the remaining training data again, holding out 20% of it
# as the validation set (stratified on the remaining labels, same seed).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, stratify=y_train, random_state=0
)

# Fit a decision tree on the training portion only. max_depth is the
# hyperparameter to vary between runs; 5 is the value used here.
tree = DecisionTreeClassifier(max_depth=5, random_state=0)
tree.fit(x_train, y_train)

# Predict each partition with the fitted tree (train, validation, test order).
pred_train, pred_val, pred_test = (
    tree.predict(features) for features in (x_train, x_val, x_test)
)

# Score each partition's predictions against its true labels and report
# the three accuracies side by side.
acc_train, acc_val, acc_test = (
    accuracy_score(truth, predicted)
    for truth, predicted in (
        (y_train, pred_train),
        (y_val, pred_val),
        (y_test, pred_test),
    )
)
print(acc_train, acc_val, acc_test)
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
# K-fold cross-validation: train and evaluate a fresh decision tree on each
# of 5 train/validation splits, collecting per-fold accuracies.
acc_train_list = []
acc_val_list = []
# 4-5 folds is typical. KFold does not shuffle by default, so the folds
# follow the existing row order of x.
kfold = KFold(5)
for train_index, val_index in kfold.split(x):
    # NOTE(review): indexing with index arrays assumes x and y support it
    # (e.g. NumPy arrays) - confirm against where x and y are built.
    x_train, y_train = x[train_index], y[train_index]
    x_val, y_val = x[val_index], y[val_index]
    # Model: a new tree per fold so nothing carries over between folds.
    tree = DecisionTreeClassifier(random_state=0)
    # Train on this fold's training portion.
    tree.fit(x_train, y_train)
    # Evaluate on both the training and the validation portion.
    pred_train = tree.predict(x_train)
    pred_val = tree.predict(x_val)
    acc_train_list.append(accuracy_score(y_train, pred_train))
    acc_val_list.append(accuracy_score(y_val, pred_val))
# Report the per-fold accuracies followed by their mean.
print(acc_train_list, np.mean(acc_train_list))
print(acc_val_list, np.mean(acc_val_list))
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
# Stratified K-fold cross-validation: same procedure as plain K-fold, but
# the splitter is also given y so it can stratify the 4 folds on the labels.
acc_train_list = []
acc_val_list = []
s_fold = StratifiedKFold(4)
for train_index, val_index in s_fold.split(x, y):
    # NOTE(review): indexing with index arrays assumes x and y support it
    # (e.g. NumPy arrays) - confirm against where x and y are built.
    x_train, y_train = x[train_index], y[train_index]
    x_val, y_val = x[val_index], y[val_index]
    # Create the model: a new tree for every fold.
    tree = DecisionTreeClassifier(random_state=0)
    # Train on this fold's training portion.
    tree.fit(x_train, y_train)
    # Validate: predict both portions and record their accuracies.
    pred_train = tree.predict(x_train)
    pred_val = tree.predict(x_val)
    acc_train_list.append(accuracy_score(y_train, pred_train))
    acc_val_list.append(accuracy_score(y_val, pred_val))
# Report the per-fold training and validation accuracies.
print(acc_train_list, acc_val_list)
from sklearn.model_selection import cross_val_score
# Cross-validation in a single call: cross_val_score handles the splitting,
# fitting and scoring, returning one accuracy per fold (cv=4 -> 4 scores).
tree = DecisionTreeClassifier(random_state=0)
result_scores = cross_val_score(tree, x, y, scoring="accuracy", cv=4)
# Show the per-fold scores together with their mean.
print(result_scores, result_scores.mean())