Tree Classification

ManSonCoding · April 1, 2021
  • All of the code here comes from the book '혼자 공부하는 머신러닝', which holds the copyright.

Tree Classification

# Load the data
import pandas as pd

wine = pd.read_csv("https://bit.ly/wine_csv_data")
wine.describe()
           alcohol        sugar           pH        class
count  6497.000000  6497.000000  6497.000000  6497.000000
mean     10.491801     5.443235     3.218501     0.753886
std       1.192712     4.757804     0.160787     0.430779
min       8.000000     0.600000     2.720000     0.000000
25%       9.500000     1.800000     3.110000     1.000000
50%      10.300000     3.000000     3.210000     1.000000
75%      11.300000     8.100000     3.320000     1.000000
max      14.900000    65.800000     4.010000     1.000000
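
Here class is the target: 0 marks red wine and 1 marks white wine. describe() already suggests the data is complete, but an explicit check is cheap; a minimal sketch with standard pandas calls (not in the original post):

# Sketch: confirm there are no missing values before modeling
print(wine.isnull().sum())           # expect zero for every column
print(wine['class'].value_counts())  # how many reds vs. whites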
# Convert the data to NumPy arrays for use with sklearn
data = wine[['alcohol','sugar','pH']].to_numpy()
target = wine['class'].to_numpy()
# Split into training and test sets.

from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)
# alcohol, sugar, and pH are on different scales, so preprocess with StandardScaler.

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)
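
A quick sanity check, not in the original post: after StandardScaler, every training column should have mean ~0 and standard deviation ~1.

# Sketch: verify the scaled training features are standardized
import numpy as np
print(np.round(train_scaled.mean(axis=0), 6))  # ~[0, 0, 0]
print(np.round(train_scaled.std(axis=0), 6))   # ~[1, 1, 1]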
# Build a prediction model with logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))
0.7808350971714451
0.7776923076923077
print(lr.coef_, lr.intercept_)
[[ 0.51270274  1.6733911  -0.68767781]] [1.81777902]

A drawback shows up here: with logistic regression, the classification criterion is hard to read off, since the coefficients and intercept do not translate into an intuitive rule for why a wine is classified as red or white.
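
To make that concrete, here is a minimal sketch (mine, not the book's) of what the model does with those numbers: it computes z = w·x + b and squashes z through a sigmoid, so no single per-feature threshold explains a decision.

# Sketch: reproduce lr's probability for one sample by hand
import numpy as np
from scipy.special import expit  # the sigmoid function

z = train_scaled[:1] @ lr.coef_.T + lr.intercept_  # z = w.x + b
print(expit(z))                                    # P(white wine | x)
print(lr.predict_proba(train_scaled[:1])[:, 1])    # sklearn's own value, for comparison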

# Train a model with a decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(train_scaled, train_target)
print(dt.score(train_scaled, train_target))
print(dt.score(test_scaled, test_target))
0.8454877814123533
0.8415384615384616
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plt.style.use(['seaborn'])  # on matplotlib >= 3.6 the name is 'seaborn-v0_8'
plt.figure(figsize=(10, 7))
plot_tree(dt, filled=True, feature_names=['alcohol', 'sugar', 'pH'])
plt.show()

(figure: the depth-3 decision tree, nodes colored by class purity)
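
Each node in the plot reports a gini impurity; for two classes it is 1 - (p_red^2 + p_white^2), and the tree chooses the split that lowers it the most. A tiny illustrative helper (this function is mine, not from the book):

# Sketch: Gini impurity of a node, computed from its class counts
import numpy as np

def gini(counts):
    p = np.asarray(counts, dtype=float) / np.sum(counts)
    return 1.0 - np.sum(p ** 2)

print(gini([50, 50]))   # 0.5 -> maximally mixed node
print(gini([100, 0]))   # 0.0 -> pure node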

## Feature scale has no effect on tree models, so the unscaled data needs no preprocessing.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(train_input, train_target)
print(dt.score(train_input, train_target))
print(dt.score(test_input, test_target))

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plt.style.use(['seaborn'])  # on matplotlib >= 3.6 the name is 'seaborn-v0_8'
plt.figure(figsize=(10, 7))
plot_tree(dt, filled=True, feature_names=['alcohol', 'sugar', 'pH'])
plt.show()
0.8454877814123533
0.8415384615384616

(figure: the same depth-3 tree learned from the unscaled features; the scores match exactly)

dt.feature_importances_
array([0.12345626, 0.86862934, 0.0079144 ])
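
The array follows the column order the model was trained on, so pairing it with the feature names makes it readable: sugar carries roughly 87% of the importance. A small sketch:

# Sketch: label each importance score with its feature name
for name, score in zip(['alcohol', 'sugar', 'pH'], dt.feature_importances_):
    print(f'{name}: {score:.3f}')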

Cross-Validation and Grid Search

To keep the model from being tuned against test-set performance, a validation set is split off from the training data.

# Split off a validation set.
sub_input, val_input, sub_target, val_target = train_test_split(train_input, train_target, test_size=0.2, random_state=42)
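
Checking the shapes confirms the 80/20 split of the 5,197 training rows:

# Sketch: confirm the sub-training / validation split sizes
print(sub_input.shape, val_input.shape)  # expected: (4157, 3) (1040, 3)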
# Evaluate the model against the validation set
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))
0.9971133028626413
0.864423076923077
## Cross-validation: K-fold validation over the training set
import numpy as np
from sklearn.model_selection import cross_validate
scores = cross_validate(dt, train_input, train_target)
print(scores)
print(np.mean(scores['test_score']))
{'fit_time': array([0.0049479 , 0.00485611, 0.0052228 , 0.00464725, 0.00481296]), 'score_time': array([0.00066304, 0.00068879, 0.00069308, 0.00074601, 0.00067782]), 'test_score': array([0.86923077, 0.86826923, 0.8825794 , 0.86717998, 0.85370549])}
0.8681929740134745
from sklearn.model_selection import StratifiedKFold
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))
0.8681929740134745
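
The score is unchanged because, for a classifier, cross_validate already stratifies its folds by default. To shuffle the data or use more folds, pass a configured splitter; a sketch of the shuffled 10-fold variant:

# Sketch: shuffled, stratified 10-fold cross-validation
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))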
## Use grid search to set hyperparameters automatically.
from sklearn.model_selection import GridSearchCV
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)
GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})
# Print the training score of the best decision tree found by the grid search.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))
print(gs.best_params_)
0.9615162593804117
{'min_impurity_decrease': 0.0001}
best_index = np.argmax(gs.cv_results_['mean_test_score'])  # index of the highest mean validation score
print(gs.cv_results_['params'][best_index])
{'min_impurity_decrease': 0.0001}
# Expand the grid: 9 * 15 * 10 = 1,350 parameter combinations, each cross-validated 5-fold (6,750 fits)
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)}
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)
GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})
dt = gs.best_estimator_
print(dt.score(train_input, train_target))
print(gs.best_estimator_)
0.892053107562055
DecisionTreeClassifier(max_depth=14, min_impurity_decrease=0.0004,
                       min_samples_split=12, random_state=42)
from scipy.stats import uniform, randint

# Parameter distributions to sample from
params = {'min_impurity_decrease': uniform(0.0001, 0.001),
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2, 25),  # must be >= 2, so sample from [2, 25)
          'min_samples_leaf': randint(1, 25)}
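
uniform and randint are scipy distribution objects: RandomizedSearchCV draws one candidate value per parameter per iteration by calling their .rvs() method. A quick illustration (the sampled values will vary):

# Sketch: how the distributions above produce candidate values
print(randint(2, 25).rvs(5))          # five integer draws from [2, 25)
print(uniform(0.0001, 0.001).rvs(3))  # three float draws from [0.0001, 0.0011]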
# Random search
from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params, n_iter=100, n_jobs=-1, random_state=42)
gs.fit(train_input, train_target)
dt = gs.best_estimator_
print(dt.score(train_input, train_target))
print(dt.score(test_input, test_target))

0.8928227823744468
0.86