Ensemble


Ensembling with Out-of-Fold (OOF) predictions

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=0)

model = LGBMClassifier(random_state=456)
lgbm_pred = np.zeros(X_test.shape[0])
auc_list = []
for tr_idx, val_idx in kf.split(X_train):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]

    model.fit(tr_x, tr_y)
    pred = model.predict_proba(val_x)[:, 1]

    auc = roc_auc_score(val_y, pred)
    auc_list.append(auc)

    # accumulate the test-set predictions, averaged over the 5 folds
    lgbm_pred += model.predict_proba(X_test)[:, 1] / kf.n_splits

print(f'{model.__class__.__name__} 5-fold mean AUC: {np.mean(auc_list)}')
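The loop above only averages the test-set predictions. Keeping each fold's validation predictions as well yields a full OOF vector for the training set, which can later serve as a meta-feature for stacking. A minimal sketch, assuming the same X_train, y_train, model, and kf as above:

# Sketch: collect OOF predictions for the training rows too
oof_pred = np.zeros(X_train.shape[0])
for tr_idx, val_idx in kf.split(X_train):
    model.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])
    # each training row is predicted once, by the fold that held it out
    oof_pred[val_idx] = model.predict_proba(X_train.iloc[val_idx])[:, 1]
print('OOF AUC:', roc_auc_score(y_train, oof_pred))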

Voting ensemble

  • Use when the evaluation metric is accuracy, recall, precision, etc.
  • Not recommended in general, though: it takes too long.
from sklearn.ensemble import VotingClassifier

# Ensemble every model evaluated above
voting = VotingClassifier(
    estimators=[(type(clf).__name__, clf) for clf in clfs], voting='hard')
voting.fit(X_train, y_train).score(X_test, y_test)

# Ensemble only the best model (GBM) with the models least correlated with it;
# pass the models to ensemble as a list of (name, estimator) tuples
voting = VotingClassifier(
    estimators=[('gbm', GBM), ('mlp', MLP), ('dt', DT)], voting='hard')
# hard voting: majority vote; soft voting: average of predicted probabilities
voting.fit(X_train, y_train).score(X_test, y_test)
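When every base model exposes predict_proba, soft voting often works better because it keeps the confidence information instead of collapsing it to votes. A minimal sketch; the 2:1:1 weights are an illustrative assumption, not from the original post:

# Soft voting averages predict_proba outputs; weights can favor stronger models
voting_soft = VotingClassifier(
    estimators=[('gbm', GBM), ('mlp', MLP), ('dt', DT)],
    voting='soft', weights=[2, 1, 1])   # hypothetical weighting
voting_soft.fit(X_train, y_train).score(X_test, y_test)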

Averaging predictions

  • Use when the evaluation metric is ROC-AUC, log loss, etc.
  • Arithmetic mean, geometric mean, harmonic mean, power mean
  • ML_1030_02_power_mean.ipynb
from scipy.stats import gmean, hmean

# pred_df: one column of predictions per model
m_mean = pred_df.mean(axis=1)              # arithmetic mean
g_mean = pred_df.apply(gmean, axis=1)      # geometric mean
h_mean = pred_df.apply(hmean, axis=1)      # harmonic mean
# weighted mean: 0.4 on col1, 0.6 on the average of the remaining columns
w_mean = pred_df['col1']*0.4 + pred_df.drop(['col1'], axis=1).mean(axis=1)*0.6
# power mean with p = 1.2: (mean(x**p))**(1/p)
ns_mean = pred_df.pow(1.2).mean(axis=1).pow(1/1.2)
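A quick toy check (the numbers are made up) shows how the means behave when one model disagrees with the others; the geometric and harmonic means are pulled toward the low prediction:

import pandas as pd
from scipy.stats import gmean

# hypothetical predictions from three models for two samples
pred_df = pd.DataFrame({'col1': [0.90, 0.10],
                        'col2': [0.85, 0.30],
                        'col3': [0.20, 0.05]})
print(pred_df.mean(axis=1).values)            # arithmetic: [0.65, 0.15]
print(pred_df.apply(gmean, axis=1).values)    # geometric: lower, punishes disagreement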

Ensembling the same model with different random seeds

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingRegressor

random_seeds = np.random.randint(0, 10000, size=10)
preds_dict = {}
for seed in tqdm(random_seeds, total=len(random_seeds)):
    model = GradientBoostingRegressor(random_state=seed)
    model.fit(tr_x, tr_y)

    pred = model.predict(te_x)

    # min-max scale |pred| and flip, so small raw predictions
    # (te_y < 10, i.e. class 1 below) get scores close to 1
    min_val = np.min(np.abs(pred))
    max_val = np.max(np.abs(pred))
    scaled_pred = (np.abs(pred) - min_val) / (max_val - min_val)
    r_pred = 1 - scaled_pred

    preds_dict[f'col_{seed}'] = r_pred

pred_df = pd.DataFrame(preds_dict)
pred_df['y'] = np.where(te_y < 10, 1, 0)   # binarize the regression target
temp = pred_df.groupby(['y']).mean().T
temp['diff'] = temp[1] - temp[0]
temp   # keep the random seeds with a large diff (good class separation)
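Once the better seeds are identified, their columns can simply be averaged into a final score. A sketch, picking two seeds as a hypothetical choice:

# hypothetical: suppose the first two seeds showed the largest diff above
chosen_seeds = random_seeds[:2]
final_pred = pred_df[[f'col_{s}' for s in chosen_seeds]].mean(axis=1)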

Stacking

2-layer stacking

  • using vecstack
from vecstack import stacking
from sklearn.metrics import accuracy_score

models = clfs
S_train, S_test = stacking(models,                    # list of models
                           X_train, y_train, X_test,  # data
                           regression=False,
                           needs_proba=False,
                           metric=accuracy_score,
                           n_folds=5,
                           stratified=True,
                           shuffle=True,
                           random_state=0,
                           verbose=2)                 # 2 = print all info

meta_model = GBM.fit(S_train, y_train)
accuracy_score(y_test, meta_model.predict(S_test))
# S_train and S_test get one column per ensembled model
  • using sklearn
from sklearn.ensemble import StackingClassifier

# 2-layer stacking

estimators = [(type(clf).__name__, clf) for clf in clfs]
stk_clf = StackingClassifier(
    estimators=estimators, final_estimator=GBM, cv=5)

stk_clf.fit(X_train, y_train).score(X_test, y_test)
  • By hand
# Stacking ensemble over different random seeds of the same model
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import classification_report

def pred_scaling(pred):
    # same min-max-and-flip scaling as in the seed-ensemble block above
    p = np.abs(pred)
    return 1 - (p - p.min()) / (p.max() - p.min())

random_seeds = [11, 53, 679, 2020, 3000, 5481, 7447]
train_dict = {}
test_dict = {}
for seed in tqdm(random_seeds, total=len(random_seeds)):
    model = GradientBoostingRegressor(random_state=seed)
    model.fit(tr_x, tr_y)

    tr_pred = model.predict(tr_x)
    train_dict[f'col_{seed}'] = pred_scaling(tr_pred)

    te_pred = model.predict(te_x)
    test_dict[f'col_{seed}'] = pred_scaling(te_pred)

X_train = pd.DataFrame(train_dict)
X_test = pd.DataFrame(test_dict)
print(X_train.shape, X_test.shape)

meta_model = RandomForestClassifier(random_state=45)
meta_model.fit(X_train, np.where(tr_y < 10, 1, 0))   # binarize the target
y_pred = meta_model.predict_proba(X_test)

pred_ = np.where(y_pred[:, 1] > 0.2, 1, 0)           # 0.2: hand-picked cutoff
print(classification_report(np.where(te_y < 10, 1, 0), pred_))
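The 0.2 cutoff above is arbitrary. A small sweep over candidate thresholds (a sketch, reusing y_pred and the binarized labels from the block above) makes the choice less ad hoc:

from sklearn.metrics import f1_score

y_true = np.where(te_y < 10, 1, 0)
# evaluate F1 at several cutoffs and keep the best-scoring one
for thr in [0.1, 0.2, 0.3, 0.4, 0.5]:
    f1 = f1_score(y_true, np.where(y_pred[:, 1] > thr, 1, 0))
    print(f'threshold={thr:.1f}  F1={f1:.3f}')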

3-layer stacking

  • using vecstack
# level-1: LR, DT, MLP, KNN, RF, GBM
from sklearn.metrics import log_loss

models = clfs
S_train, S_test = stacking(models,                    # list of models
                           X_train, y_train, X_test,  # data
                           regression=False,          # classification task (set to
                                                      #     True for regression)
                           needs_proba=True,          # predict probabilities (set to
                                                      #     False for class labels)
                           metric=log_loss,           # the metric must accept
                                                      #     probabilities here
                           n_folds=3,                 # number of folds
                           stratified=True,           # stratified split for folds
                           shuffle=True,              # shuffle the data
                           random_state=0,            # ensure reproducibility
                           verbose=0)                 # 0 = silent
                           
# level-2: LR, DT, KNN
# level-3: Voting
voting = VotingClassifier(
    estimators=[('lr', LR), ('dt', DT), ('knn', KNN)], voting='hard')
voting.fit(S_train, y_train).score(S_test, y_test)
  • using sklearn
# 3-layer stacking (Level-3: Voting)

layer_one_estimators = [(type(clf).__name__, clf) for clf in clfs]
voting = VotingClassifier(estimators = [('lr', LR), ('dt', DT), ('knn', KNN)], voting='hard')
stk_clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=voting, cv=5)

stk_clf.fit(X_train, y_train).score(X_test, y_test)

# 3-layer stacking (Level-3: LR)

layer_one_estimators = [(type(clf).__name__, clf) for clf in clfs]
layer_two_estimators = [('lr', LR), ('dt', DT), ('knn', KNN)]

layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=LR)
stk_clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

stk_clf.fit(X_train, y_train).score(X_test, y_test)