from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# First split: hold out 20% of the full data set as the final test set.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42)

# Second split: carve a validation set (20% of the training portion) out of
# the training data, leaving `sub_*` for the actual fitting.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)

# Baseline decision tree with default hyperparameters, fitted on the
# sub-training set only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Report the score on the data the tree was fitted on, then on the
# validation set, in that order.
for features, labels in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(features, labels))
0.9971133028626413
0.864423076923077
import numpy as np
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the whole training set (no separate validation
# split is needed here). The result is a dict of per-fold arrays.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

# 'test_score' holds the per-fold validation scores (it is "test" in name
# only); their mean is the cross-validation score.
fold_scores = scores['test_score']
print(np.mean(fold_scores))
{'fit_time': array([0.00821614, 0.00822687, 0.00734401, 0.00731444, 0.00711036]),
'score_time': array([0.00078368, 0.0007267 , 0.00067711, 0.00068378, 0.00078654]),
'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}
0.855300214703487 // 검증 score 평균(이름만 test)
GridSearchCV()
min_impurity_decrease : 부모와 자식 노드간 불순도 차이(감소율)의 최솟값을 지정하여 이보다 작으면 분할 X
(= 정보이득. 클 수록 좋은 분할)
5ํด๋ ๊ต์ฐจ๊ฒ์ฆ x 5๋ฒ = 25๊ฐ ๋ชจ๋ธ
n_jobs = -1 : cpu์ ๋ชจ๋ ์ฝ์ด๋ฅผ ์ฌ์ฉํด ๋ณ๋ ฌ๋ก ์์
์ต์ ์ ๊ฐ์ best_estimator_ ์์ฑ์ ๋ฃ์ด ๊ฒฐ์ ํธ๋ฆฌ๋ก
from sklearn.model_selection import GridSearchCV

# Candidate values for the single hyperparameter being tuned.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

# Exhaustive search over the grid; n_jobs=-1 uses every available CPU core.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

# The search keeps the model trained with the best parameter combination in
# `best_estimator_`; use it as the working tree from here on.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

# Winning parameter value, followed by the mean cross-validation score of
# every candidate in the grid.
print(gs.best_params_)
print(gs.cv_results_['mean_test_score'])
0.9615162593804117
{'min_impurity_decrease': 0.0001} // 최적
[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]
-> 교차검증 점수
from scipy.stats import randint, uniform

# Frozen discrete-uniform distribution over the integers 0..9
# (the upper bound 10 is exclusive).
rgen = randint(0, 10)

# Draw ten random integers from it; in a notebook this cell displays the
# sampled array.
rgen.rvs(size=10)
array([1, 0, 4, 8, 6, 2, 2, 3, 6, 9]) // 랜덤 수 뽑아줌
from sklearn.model_selection import RandomizedSearchCV

# Sampling distributions for each hyperparameter.
# NOTE: scipy's `uniform` takes (loc, scale), so this samples from
# [0.0001, 0.0001 + 0.001] = [0.0001, 0.0011], not [0.0001, 0.001].
# `randint(low, high)` excludes the upper bound.
params = {
    'min_impurity_decrease': uniform(loc=0.0001, scale=0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

# Try 100 randomly sampled parameter combinations, using all CPU cores.
gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

# Best sampled combination and the highest mean cross-validation score.
print(gs.best_params_)
cv_means = gs.cv_results_['mean_test_score']
print(np.max(cv_means))

# Final check: score the best model on the held-out test set.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))
{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173,
'min_samples_leaf': 7, 'min_samples_split': 13}
0.8695428296438884
// mean_test_score์ ์ต๋๊ฐ = best_params_ ๋ก ์ฐพ์ ๊ฒ์ฆ๊ฐ
0.86 // ์ต์ ๊ฐ์ผ๋ก Test
📚혼공 MLDL-12