Out of Fold(OOF) 방식의 예측값으로 앙상블
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
# Out-of-fold (OOF) ensemble: train on each CV fold and average the
# test-set probabilities over all folds.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
model = LGBMClassifier(random_state=456)

# Accumulator for the averaged positive-class test probabilities.
lgbm_pred = np.zeros(X_test.shape[0])
auc_list = []  # validation AUC of each fold

for tr_idx, val_idx in kf.split(X_train, y_train):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

    model.fit(tr_x, tr_y)

    # Score the held-out fold on the positive-class probability.
    pred = model.predict_proba(val_x)[:, 1]
    auc = roc_auc_score(val_y, pred)
    auc_list.append(auc)

    # Add this fold's share of the test prediction. Dividing by
    # kf.get_n_splits() instead of a hard-coded 5 keeps the average
    # correct if n_splits is ever changed.
    sub_pred = np.array(model.predict_proba(X_test)[:, 1]) / kf.get_n_splits()
    lgbm_pred += sub_pred

print(f'{model.__class__.__name__}의 5fold 평균 AUC는 {np.mean(auc_list)}')
Voting ensemble
- 평가지표가 accuracy, recall, precision 등일 경우
- but 권장 안함, 시간이 너무 오래 걸림
from sklearn.ensemble import VotingClassifier
# Hard-voting ensemble: name each pre-configured classifier after its class.
named_estimators = [(type(clf).__name__, clf) for clf in clfs]
voting = VotingClassifier(estimators=named_estimators, voting='hard')
voting.fit(X_train, y_train)
voting.score(X_test, y_test)

# Same idea with an explicitly named trio of base models.
voting = VotingClassifier(
    estimators=[('gbm', GBM), ('mlp', MLP), ('dt', DT)],
    voting='hard',
)
voting.fit(X_train, y_train)
voting.score(X_test, y_test)
Averaging predictions
- 평가지표가 roc-auc, logloss 등일 경우 사용
- 산술평균, 기하평균, 조화평균, 멱평균(power mean)
- ML_1030_02_power_mean.ipynb
from scipy.stats import gmean
from scipy.stats import hmean
# Row-wise blends of the per-model prediction columns in pred_df.
m_mean = pred_df.mean(axis=1)                       # arithmetic mean
g_mean = pred_df.apply(gmean, axis=1)               # geometric mean
h_mean = pred_df.apply(hmean, axis=1)               # harmonic mean
# Weighted mean: 40% on 'col1', 60% on the average of the remaining columns.
w_mean = 0.4 * pred_df['col1'] + 0.6 * pred_df.drop(columns=['col1']).mean(axis=1)
# Power (generalized) mean with exponent 1.2.
ns_mean = pred_df.pow(1.2).mean(axis=1).pow(1 / 1.2)
동일 모델 서로 다른 Random_seed 앙상블
# Ensemble of the same model trained under 10 different random seeds; each
# seed's scaled predictions become one column of pred_df.
random_seeds = np.random.randint(0, 10000, size=10)
preds_dict = {}

for seed in tqdm(random_seeds, total=len(random_seeds)):  # was a hard-coded total=10
    model = GradientBoostingRegressor(random_state=seed)
    model.fit(tr_x, tr_y)
    pred = model.predict(te_x)

    # Min-max scale the absolute predictions into [0, 1] ...
    min_val = np.min(np.abs(pred))
    max_val = np.max(np.abs(pred))
    scaled_pred = (np.abs(pred) - min_val) / (max_val - min_val)
    # ... then invert so smaller raw predictions map toward 1.
    r_pred = 1 - scaled_pred
    # Bug fix: the original key was f'col_{seed}] — an unterminated
    # f-string literal (SyntaxError).
    preds_dict[f'col_{seed}'] = r_pred

pred_df = pd.DataFrame(preds_dict)
# Binary target: 1 when the true value is below 10.
pred_df['y'] = np.where(te_y < 10, 1, 0)

# Per-class mean of each seed column, transposed, plus the class separation.
temp = pred_df.groupby(['y']).mean().T
temp['diff'] = temp[1] - temp[0]
temp
Stacking
2-layer stacking
from vecstack import stacking
# First layer: generate out-of-fold meta-features with vecstack.
# Bug fix: original read `model = clfs` but the stacking() call below uses
# `models`, which would raise NameError here.
models = clfs
S_train, S_test = stacking(
    models,
    X_train, y_train, X_test,
    regression=False,       # classification task
    needs_proba=False,      # stack hard class labels, not probabilities
    metric=accuracy_score,  # per-fold metric reported when verbose > 0
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=2,
)

# Second layer: fit the meta-model on the stacked OOF features and score it.
meta_model = GBM.fit(S_train, y_train)
accuracy_score(y_test, meta_model.predict(S_test))
from sklearn.ensemble import StackingClassifier
# sklearn's built-in stacking: every base classifier feeds a GBM final
# estimator through 5-fold cross-validated meta-features.
estimators = [(type(clf).__name__, clf) for clf in clfs]
stk_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=GBM,
    cv=5,
)
stk_clf.fit(X_train, y_train)
stk_clf.score(X_test, y_test)
# Manual 2-layer stack: each seed's scaled GBR predictions become one
# meta-feature column for train and test.
random_seeds = [11, 53, 679, 2020, 3000, 5481, 7447]
train_dict = {}
test_dict = {}

for seed in tqdm(random_seeds, total=len(random_seeds)):  # was total=10 for 7 seeds
    model = GradientBoostingRegressor(random_state=seed)
    model.fit(tr_x, tr_y)

    # Scaled predictions on the training rows -> meta-training features.
    tr_pred = model.predict(tr_x)
    train_dict[f'col_{seed}'] = pred_scaling(tr_pred)

    # Scaled predictions on the test rows -> meta-test features.
    # Bug fix: these were written into train_dict, which both clobbered the
    # training features and left test_dict (and hence X_test) empty.
    te_pred = model.predict(te_x)
    test_dict[f'col_{seed}'] = pred_scaling(te_pred)

X_train = pd.DataFrame(train_dict)
X_test = pd.DataFrame(test_dict)
print(X_train.shape, X_test.shape)

# Meta-model: classify whether the true target value is below 10.
meta_model = RandomForestClassifier(random_state=45)
meta_model.fit(X_train, np.where(tr_y < 10, 1, 0))

y_pred = meta_model.predict_proba(X_test)
# Threshold the positive-class probability at 0.2 (favours recall).
pred_ = np.where(y_pred[:, 1] > 0.2, 1, 0)
print(classification_report(te_y, pred_))  # bug fix: closing paren was missing
3-layer stacking
# First layer: 3-fold out-of-fold class probabilities from each base model.
models = clfs
S_train, S_test = stacking(
    models,
    X_train, y_train, X_test,
    regression=False,
    needs_proba=True,       # stack probabilities rather than hard labels
    metric=accuracy_score,
    n_folds=3,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

# Second layer: hard-voting trio trained on the stacked probability features.
voting = VotingClassifier(
    estimators=[('lr', LR), ('dt', DT), ('knn', KNN)],
    voting='hard',
)
voting.fit(S_train, y_train)
voting.score(S_test, y_test)
# 3-layer stack via sklearn: base models feed a hard-voting ensemble that
# acts as the final estimator.
layer_one_estimators = [(type(clf).__name__, clf) for clf in clfs]
voting = VotingClassifier(
    estimators=[('lr', LR), ('dt', DT), ('knn', KNN)],
    voting='hard',
)
stk_clf = StackingClassifier(
    estimators=layer_one_estimators,
    final_estimator=voting,
    cv=5,
)
stk_clf.fit(X_train, y_train)
stk_clf.score(X_test, y_test)
# 3-layer stack with a nested StackingClassifier: base models -> stacked
# LR/DT/KNN middle layer -> LR meta-learner on top.
layer_one_estimators = [(type(clf).__name__, clf) for clf in clfs]
layer_two_estimators = [('lr', LR), ('dt', DT), ('knn', KNN)]
layer_two = StackingClassifier(
    estimators=layer_two_estimators,
    final_estimator=LR,
)
stk_clf = StackingClassifier(
    estimators=layer_one_estimators,
    final_estimator=layer_two,
    cv=5,
)
stk_clf.fit(X_train, y_train)
stk_clf.score(X_test, y_test)