!pip install optuna
library 설명
Optuna
LGBM Regressor
train and val split
Evaluation Score: 사용하려는 평가 지표에 맞게 바꿔 준다.
# Optuna Libraries
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
# LGBM Regressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping
# train_test_split
from sklearn.model_selection import train_test_split
# Evaluation Score
from sklearn.metrics import mean_squared_error
# Hold out 20% of (X, y) as a validation split; the fixed random_state keeps
# the split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
데이터의 사이즈가 클 경우 30~50사이도 괜찮다.
# TPE (Tree-structured Parzen Estimator) sampler; the fixed seed makes the
# sequence of suggested hyperparameters reproducible.
sampler = TPESampler(seed=10)
# define function
def objective(trial):
    """Train one LightGBM regressor for an Optuna trial and return its MSE.

    A hyperparameter set is sampled from ``trial``, an ``LGBMRegressor`` is
    fitted on the module-level ``X_train`` / ``y_train`` with early stopping
    monitored on ``X_val`` / ``y_val``, and the fitted model is scored on that
    same validation split.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean squared error of the model's predictions on ``X_val``
        (lower is better). Swap the metric here if another score is wanted.
    """
    lgbm_param = {
        # Fixed settings (not tuned, so they do not appear in trial.params).
        'objective': 'regression',
        'verbose': -1,
        'metric': 'mse',
        # Tuned settings. suggest_float replaces the deprecated
        # suggest_uniform / suggest_loguniform calls with the same ranges.
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        # NOTE(review): range kept from the original; its lower end is
        # extremely small -- consider raising it.
        'learning_rate': trial.suggest_float(
            'learning_rate', 1e-8, 1e-2, log=True
        ),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0, log=True),
        # LightGBM only applies row subsampling when subsample_freq > 0, so
        # without this the tuned 'subsample' value has no effect. It is
        # sampled (rather than hard-coded) so that it is carried along in
        # trial.params together with 'subsample'.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
    }

    # Generate model
    model_lgbm = LGBMRegressor(**lgbm_param)
    # Early stopping is requested through the callback API; the
    # early_stopping_rounds / verbose keyword arguments of fit() were removed
    # in LightGBM 4.0.
    model_lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=25, verbose=False)],
    )

    # Evaluation metric: replace with whichever score should be optimised.
    return mean_squared_error(y_val, model_lgbm.predict(X_val))
# Create an in-memory study that minimises the value returned by `objective`,
# using the sampler defined earlier.
optuna_lgbm = optuna.create_study(
    sampler=sampler,
    direction='minimize',
)
# n_trials sets how many hyperparameter sets Optuna evaluates. The original
# author notes that 50 trials already give meaningful results.
optuna_lgbm.optimize(
    objective,
    n_trials=100,
)
[I 2021-12-19 07:05:55,142] A new study created in memory with name: no-name-d84a892f-dd09-4e10-bb41-a6465c9a5049
[I 2021-12-19 07:05:57,578] Trial 0 finished with value: 36.67533585948705 and parameters: {'num_leaves': 230, 'colsample_bytree': 0.7062255848078204, 'reg_alpha': 0.6336482349262754, 'reg_lambda': 7.488038825386118, 'max_depth': 9, 'learning_rate': 2.2324403996622945e-07, 'n_estimators': 674, 'min_child_samples': 78, 'subsample': 0.46704202331689854}. Best is trial 0 with value: 36.67533585948705.
[I 2021-12-19 07:05:59,078] Trial 1 finished with value: 31.977027811932338 and parameters: {'num_leaves': 3, 'colsample_bytree': 0.9056079455103392, 'reg_alpha': 0.9533933461949365, 'reg_lambda': 0.039482663279144514, 'max_depth': 9, 'learning_rate': 0.0007511484404875245, 'n_estimators': 1876, 'min_child_samples': 74, 'subsample': 0.5226478358414336}. Best is trial 1 with value: 31.977027811932338.
[I 2021-12-19 07:06:03,752] Trial 2 finished with value: 35.305493699882 and parameters: {'num_leaves': 599, 'colsample_bytree': 0.9143727350193072, 'reg_alpha': 0.5425443680112613, 'reg_lambda': 1.4217004760152696, 'max_depth': 7, 'learning_rate': 0.00011086684542255971, 'n_estimators': 1381, 'min_child_samples': 46, 'subsample': 0.7045213978206539}. Best is trial 1 with value: 31.977027811932338.
[I 2021-12-19 07:06:06,842] Trial 3 finished with value: 29.735255721276893 and parameters: {'num_leaves': 43, 'colsample_bytree': 0.8951191545794401, 'reg_alpha': 0.6010389534045444, 'reg_lambda': 8.052231968327465, 'max_depth': 9, 'learning_rate': 0.00283069634962167, 'n_estimators': 1026, 'min_child_samples': 13, 'subsample': 0.5268907502792566}. Best is trial 3 with value: 29.735255721276893.
[I 2021-12-19 07:06:07,370] Trial 4 finished with value: 33.82708945842396 and parameters: {'num_leaves': 3, 'colsample_bytree': 0.948604397892303, 'reg_alpha': 0.04689631938924976, 'reg_lambda': 6.262871483113925, 'max_depth': 10, 'learning_rate': 0.0008236106883720722, 'n_estimators': 677, 'min_child_samples': 87, 'subsample': 0.5520731753100153}. Best is trial 3 with value: 29.735255721276893.
[I 2021-12-19 07:06:08,124] Trial 5 finished with value: 36.66879824854725 and parameters: {'num_leaves': 207, 'colsample_bytree': 0.7887885120639035, 'reg_alpha': 0.8839364795611863, 'reg_lambda': 3.255116378322488, 'max_depth': 5, 'learning_rate': 2.2655594541621946e-06, 'n_estimators': 371, 'min_child_samples': 83, 'subsample': 0.4594195026585646}. Best is trial 3 with value: 29.735255721276893.
# Fetch the best trial of the finished study and report its score together
# with the hyperparameters that produced it.
lgbm_trial = optuna_lgbm.best_trial
lgbm_trial_params = lgbm_trial.params
print(
    'Best Trial: score {},\nparams {}'.format(
        lgbm_trial.value,
        lgbm_trial_params,
    )
)
Best Trial: score 29.258267034510514,
params {'num_leaves': 6, 'colsample_bytree': 0.9618239747082433, 'reg_alpha': 0.7128910450001208, 'reg_lambda': 9.509642161650188, 'max_depth': 6, 'learning_rate': 0.009911018815851144, 'n_estimators': 2421, 'min_child_samples': 12, 'subsample': 0.7323100893074933}
# Bare expression: in a notebook cell this displays the best hyperparameters.
lgbm_trial_params
{'colsample_bytree': 0.9618239747082433,
'learning_rate': 0.009911018815851144,
'max_depth': 6,
'min_child_samples': 12,
'n_estimators': 2421,
'num_leaves': 6,
'reg_alpha': 0.7128910450001208,
'reg_lambda': 9.509642161650188,
'subsample': 0.7323100893074933}
# Modeling fit
# best_trial.params only contains the sampled hyperparameters, so the fixed
# settings used during the search are restored here. Training also applies the
# same early stopping rule as the objective, so this model is built the same
# way as the one whose validation score was reported as the best trial.
lgbm = LGBMRegressor(
    objective='regression',
    verbose=-1,
    metric='mse',
    **lgbm_trial_params,
)
lgbm_study = lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=25, verbose=False)],
)
# Predict the y_test
submission['y_test'] = lgbm_study.predict(X_test)