오늘은 연휴 전 마지막 수업이었다. 의사결정나무에 이어 앙상블, 랜덤포레스트에 대한 내용을 배웠다.
| 구분 | 불순도 | 특징 |
|---|---|---|
| CART | 지니 계수 사용 | 이진 분할 / 재귀적 |
| C5.0 | 엔트로피 사용 | 다중 분할 / 재귀적 |
| CHAID | 카이제곱 통계량 사용 | 다중 분할 / 범주형 특성만 처리 |
from sklearn.tree import DecisionTreeClassifier
criterion : 분할 품질 측정 기준 (gini, entropy, log_loss)
max_depth : 트리의 최대 깊이 설정
min_samples_split : 부모 노드를 분할하기 위한 최소 샘플 개수를 설정
min_samples_leaf : 자식 노드로 분할되기 위한 최소 샘플 개수를 설정
max_features : 각 노드를 분할할 때 후보로 고려할 특성 개수를 설정
ccp_alpha : 가지치기 비용 복잡도 파라미터 → 사후 가지치기를 수행할 때 사용

model_full = DecisionTreeClassifier(min_samples_split=100, random_state=0)
model_full.fit(X=X_train, y=y_train)
model_full.score(X=X_train, y=y_train)
# 0.8445157526254375
model_full.score(X=X_valid, y=y_valid)
# 0.791156462585034
pd.Series(data=model_full.feature_importances_, index=model_full.feature_names_in_).sort_values(ascending=False)
# alcohol 0.532137
# volatile acidity 0.101741
# pH 0.067792
# residual sugar 0.067128
# chlorides 0.058272
# density 0.055429
# sulphates 0.049044
# total sulfur dioxide 0.034607
# fixed acidity 0.033050
# citric acid 0.000800
# dtype: float64
hds.plot.feature_importance(model_full)

model_full.get_n_leaves()
# np.int64(62)
model_full.get_depth()
# 13
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 6))
plot_tree(model_full, feature_names=X_train.columns, class_names=model_full.classes_.astype(str), filled=True)
plt.show()

hds.plot.tree(model=model_full, fileName='dtc_full')
from IPython.display import Image
Image('dtc_full.png')

path = model_full.cost_complexity_pruning_path(X_train, y_train)
path = pd.DataFrame(path)
path.head()
# ccp_alphas impurities
# 0 0.000000 0.211743
# 1 0.000022 0.211765
# 2 0.000032 0.211797
# 3 0.000049 0.211845
# 4 0.000054 0.211900
from sklearn.base import clone
def clone_tree(alpha):
model = clone(model_full)
model.set_params(ccp_alpha=alpha)
model.fit(X_train, y_train)
return model
trees = [clone_tree(alpha) for alpha in path['ccp_alphas']]
path['leaves'] = [tree.get_n_leaves() for tree in trees]
path['tr_acc'] = [tree.score(X_train, y_train) for tree in trees]
path['vl_acc'] = [tree.score(X_valid, y_valid) for tree in trees]
path.head()
# ccp_alphas impurities leaves tr_acc vl_acc
# 0 0.000000 0.211743 62 0.844516 0.791156
# 1 0.000022 0.211765 61 0.844516 0.791156
# 2 0.000032 0.211797 60 0.844516 0.791156
# 3 0.000049 0.211845 59 0.844516 0.791156
# 4 0.000054 0.211900 58 0.844516 0.791156
hds.plot.step(data=path, x='ccp_alphas', y='tr_acc', color='red')
hds.plot.step(data=path, x='ccp_alphas', y='vl_acc', color='blue')

np.argmax() 는 최댓값이 여러 개 있으면 맨 처음 인덱스를 반환한다.
np.argmax(path['vl_acc'])
# np.int64(41)
indices = np.argsort(path['vl_acc'])
indices.iloc[-1]
# np.int64(42)
best_alpha = path['ccp_alphas'][indices.iloc[-1]]
# np.float64(0.0077337653548115864)
model_prun = clone(model_full)
model_prun.set_params(ccp_alpha=best_alpha)
model_prun.fit(X_train, y_train)
model_prun.score(X_train, y_train)
# 0.8042590431738623
model_prun.score(X_valid, y_valid)
# 0.7959183673469388
model_prun.get_depth()
# 2
model_prun.get_n_leaves()
# np.int64(3)
plot_tree(model_prun, feature_names=X_train.columns, class_names=['Good', 'Best'], filled=True)
plt.show()

y_pred_full = model_full.predict(X_valid)
y_pred_prun = model_prun.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_full)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_prun)


y_prob_full = model_full.predict_proba(X_valid)
y_prob_prun = model_prun.predict_proba(X_valid)
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_full, color='red')
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_prun, color='blue')

hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_full, color='red')
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_prun, color='blue')

y_valid.value_counts(normalize=True)
# grade
# 0 0.787075
# 1 0.212925
# # Name: proportion, dtype: float64
cutoff = 0.212925
y_pred_best_0 = np.where(y_prob_full[:, 1] >= cutoff, 1, 0)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_0)

from sklearn.metrics import f1_score
def valid_f1_score(tree):
y_pred = tree.predict(X=X_valid)
score = f1_score(y_true=y_valid, y_pred=y_pred)
return score
path['vl_f1s'] = [valid_f1_score(tree) for tree in trees]
hds.plot.step(data=path, x='ccp_alphas', y='vl_f1s')
plt.xlim(-0.005, 0.025)
plt.show()

index = np.argsort(path['vl_f1s'])
best_alpha_f1s = path['ccp_alphas'][index.iloc[-1]]
# np.float64(0.0014843053552403666)
model_best_1 = clone(model_full)
model_best_1.set_params(ccp_alpha=best_alpha_f1s)
model_best_1.fit(X_train, y_train)
y_pred_best_1 = model_best_1.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_1)

from imblearn.over_sampling import SMOTE
smote = SMOTE(k_neighbors=5, random_state=0)
X_bal, y_bal = smote.fit_resample(X_train, y_train)
model_best_2 = DecisionTreeClassifier(min_samples_split=100, random_state=0)
model_best_2.fit(X_bal, y_bal)
model_best_2.score(X_bal, y_bal)
# 0.8508019395747856
model_best_2.score(X_valid, y_valid)
# 0.736734693877551
y_pred_best_2 = model_best_2.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_2)

class_weight 매개변수에 'balanced' 지정하여 학습
model_best_3 = clone(model_full)
model_best_3.set_params(class_weight='balanced')
model_best_3.fit(X_train, y_train)
model_best_3.score(X_train, y_train)
# 0.793757292882147
model_best_3.score(X_valid, y_valid)
# 0.717687074829932
y_pred_best_3 = model_best_3.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best_3)

from sklearn.tree import DecisionTreeRegressor
model_full = DecisionTreeRegressor(min_samples_split=30, random_state=0)
model_full.fit(X_train, y_train)
model_full.score(X_train, y_train)
# 0.716395830725133
model_full.score(X_valid, y_valid)
# 0.35409551601258393
pd.Series(data=model_full.feature_importances_, index=model_full.feature_names_in_).sort_values(ascending=False)
# Glucose 0.741621
# BloodPressure 0.062569
# SkinThickness 0.054211
# BMI 0.053741
# Age 0.045612
# Pedigree 0.026888
# Pregnancies 0.015359
# dtype: float64
hds.plot.feature_importance(model_full)

path = model_full.cost_complexity_pruning_path(X_train, y_train)
path = pd.DataFrame(path)
path.head()
# ccp_alphas impurities
# 0 0.000000 2562.532263
# 1 2.634336 2565.166599
# 2 3.450466 2568.617064
# 3 3.849355 2572.466420
# 4 4.250502 2580.967423
from sklearn.base import clone
def clone_tree(alpha):
model = clone(model_full)
model.set_params(ccp_alpha=alpha)
model.fit(X_train, y_train)
return model
trees = [clone_tree(alpha) for alpha in path['ccp_alphas']]
path['leaves'] = [tree.get_n_leaves() for tree in trees]
path['tr_rsq'] = [tree.score(X_train, y_train) for tree in trees]
path['vl_rsq'] = [tree.score(X_valid, y_valid) for tree in trees]
hds.plot.step(data=path, x='ccp_alphas', y='tr_rsq', color='red')
hds.plot.step(data=path, x='ccp_alphas', y='vl_rsq', color='blue')
plt.xlim(-5, 250)
plt.show()

index = np.argsort(path['vl_rsq'])
best_alpha = path['ccp_alphas'][index.iloc[-1]]
# np.float64(38.711664229416954)
model_prun = clone(model_full)
model_prun.set_params(ccp_alpha=best_alpha)
model_prun.fit(X_train, y_train)
y_pred_full = model_full.predict(X_valid)
y_pred_prun = model_prun.predict(X_valid)
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_full)

hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_prun)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(oob_score=True, random_state=0)
model.fit(X_train, y_train)
model.score(X_train, y_train)
# 1.0
model.score(X_valid, y_valid)
# 0.8653061224489796
pd.Series(data=model.feature_importances_, index=model.feature_names_in_).sort_values(ascending=False)
# alcohol 0.181730
# density 0.126302
# volatile acidity 0.094409
# chlorides 0.094020
# pH 0.093747
# total sulfur dioxide 0.090092
# residual sugar 0.088768
# sulphates 0.085994
# citric acid 0.074391
# fixed acidity 0.070548
# dtype: float64
hds.plot.feature_importance(model)

model.oob_score_
# 0.8771878646441074
def oob_score(ntree):
model.set_params(n_estimators=ntree)
model.fit(X_train, y_train)
return model.oob_score_
ntrees = range(1, 101, 10)
oob_acc = [oob_score(ntree) for ntree in ntrees]
sns.lineplot(x=ntrees, y=oob_acc, color='red', linewidth=1)
plt.show()

y_pred = model.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred)

y_prob = model.predict_proba(X_valid)
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob)

hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(oob_score=True, random_state=0)
model.fit(X_train, y_train)
model.score(X_train, y_train)
# 0.9202492412051448
model.score(X_valid, y_valid)
# 0.43848038611513096
pd.Series(data=model.feature_importances_, index=model.feature_names_in_).sort_values(ascending=False)
# Glucose 0.591254
# BMI 0.082809
# SkinThickness 0.078039
# Pedigree 0.077711
# BloodPressure 0.073544
# Age 0.070146
# Pregnancies 0.026497
# dtype: float64
hds.plot.feature_importance(model)

def oob_score(ntree):
model.set_params(n_estimators=ntree)
model.fit(X_train, y_train)
return model.oob_score_
ntrees = range(1, 101)
oob_rsq = [oob_score(ntree) for ntree in ntrees]
sns.lineplot(x=ntrees, y=oob_rsq, color='red', linewidth=1)
y_pred = model.predict(X_valid)
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred)
내일부터는 연휴가 시작되니까 그동안 부족했던 내용들과 찾아보려 했던 내용들을 공부해봐야겠다.