오늘 프로젝트 진행 내용
Autogluon 학습
from autogluon.tabular import TabularDataset, TabularPredictor
# Train an AutoGluon tabular predictor; 'Exited' is the label column.
# NOTE(review): train_data and X_test must already exist in the session
# (they are not defined anywhere in this file).
test_data = X_test
predictor = TabularPredictor(label='Exited').fit(train_data=train_data)
# Predict through the test_data alias so the variable defined above is
# actually the one being scored.
predictions = predictor.predict(test_data)
def get_score(model_name, y_test, y_pred):
    """Print accuracy, F1 score and ROC AUC for a set of predictions.

    Each metric is rounded to three decimals before being printed on a
    single line prefixed with ``model_name``. Nothing is returned.

    Args:
        model_name: Label printed in front of the metrics.
        y_test: Ground-truth labels.
        y_pred: Predictions scored against ``y_test``.
    """
    # Imported at function scope so the call works wherever it is placed:
    # this file never imports roc_auc_score, and accuracy_score / f1_score
    # are only imported further down, after the first call to get_score.
    from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

    acc = round(accuracy_score(y_test, y_pred), 3)
    f1 = round(f1_score(y_test, y_pred), 3)
    # NOTE(review): the call site in this file passes predictor.predict()
    # output; if those are hard class labels rather than scores, the AUC is
    # computed from labels only - consider passing predict_proba output.
    auc_ = round(roc_auc_score(y_test, y_pred), 3)
    print(model_name, 'accuracy: ', acc, 'f1_score: ', f1, 'AUC: ', auc_)
# Print the metrics for the AutoGluon predictions against y_test.
get_score('autogluon', y_test, predictions)
Optuna 학습
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, accuracy_score
# Which classifier objective() and the final fit build:
# "gbm", "xgboost" or "lightgbm".
MODEL_TYPE = "xgboost"
# Which cross-validated mean objective() returns to Optuna:
# "f1_score" or "accuracy".
REFIT_METRIC = "f1_score"
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated score for one trial.

    Samples hyperparameters from ``trial``, builds the classifier selected by
    the module-level ``MODEL_TYPE`` and evaluates it with stratified 5-fold
    cross-validation on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        The mean F1 score or the mean accuracy over the folds, as selected
        by the module-level ``REFIT_METRIC`` ("f1_score" or "accuracy").

    Raises:
        ValueError: If ``MODEL_TYPE`` or ``REFIT_METRIC`` is not one of the
            supported values.
    """
    # Validate up front: otherwise a bad metric name only fails with a
    # KeyError after all five folds have already been trained.
    if REFIT_METRIC not in ("f1_score", "accuracy"):
        raise ValueError(f"Unsupported REFIT_METRIC: {REFIT_METRIC!r}")

    # Hyperparameters shared by all three model families.
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 10)

    if MODEL_TYPE == "gbm":
        model = GradientBoostingClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
        )
    elif MODEL_TYPE == "xgboost":
        model = XGBClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
        )
    elif MODEL_TYPE == "lightgbm":
        model = LGBMClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            num_leaves=trial.suggest_int("num_leaves", 10, 100),
            min_child_samples=trial.suggest_int("min_child_samples", 5, 50),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            random_state=42,
        )
    else:
        # Without this branch an unknown MODEL_TYPE surfaces later as an
        # UnboundLocalError on `model`.
        raise ValueError(f"Unsupported MODEL_TYPE: {MODEL_TYPE!r}")

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores, acc_scores = [], []

    # X_train / y_train are indexed positionally with .iloc, so they are
    # expected to be pandas objects.
    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)

        f1_scores.append(f1_score(y_val, y_pred))
        acc_scores.append(accuracy_score(y_val, y_pred))

    return {
        "f1_score": np.mean(f1_scores),
        "accuracy": np.mean(acc_scores),
    }[REFIT_METRIC]
# Search for the hyperparameters that maximize the value returned by
# objective(). The sampler is seeded to match the random_state=42 used for
# the models and the CV split.
# NOTE(review): with n_jobs=-1 the trials run in parallel, so results may
# still vary between runs; use n_jobs=1 if exact reproducibility is required.
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50, n_jobs=-1)
print("Best Hyperparameters:", study.best_params)

# Rebuild the selected model family with the best hyperparameters found.
best_params = study.best_params
if MODEL_TYPE == "gbm":
    final_model = GradientBoostingClassifier(**best_params, random_state=42)
elif MODEL_TYPE == "xgboost":
    final_model = XGBClassifier(
        **best_params,
        random_state=42,
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
    )
elif MODEL_TYPE == "lightgbm":
    final_model = LGBMClassifier(**best_params, random_state=42)
else:
    # Without this branch an unknown MODEL_TYPE surfaces below as a
    # NameError on `final_model`.
    raise ValueError(f"Unsupported MODEL_TYPE: {MODEL_TYPE!r}")

# Fit on the full training set and score once on the test set.
final_model.fit(X_train, y_train)
y_pred_test = final_model.predict(X_test)
final_f1 = f1_score(y_test, y_pred_test)
final_acc = accuracy_score(y_test, y_pred_test)
print(f"Final Model F1 Score: {final_f1:.4f}")
print(f"Final Model Accuracy: {final_acc:.4f}")
XGBoost
- Best Hyperparameters: {'learning_rate': 0.06268984733977456, 'n_estimators': 350, 'max_depth': 5, 'subsample': 0.9960160633298945, 'colsample_bytree': 0.9295589390364003}
- Final Model F1 Score: 0.6386
- Final Model Accuracy: 0.8662
LightGBM
- Best Hyperparameters: {'learning_rate': 0.07813884459733288, 'n_estimators': 450, 'max_depth': 5, 'num_leaves': 83, 'min_child_samples': 12, 'subsample': 0.505509951684884, 'colsample_bytree': 0.9832006138545093}
- Final Model F1 Score: 0.6367
- Final Model Accuracy: 0.8648
- GBM은 학습에 1시간 이상 소요되어 중단함. 필요시 추가 학습 시도 예정.
회고
잘한 점
- AutoGluon과 Optuna로 좋은 성능을 보이는 모델 학습을 시도해 보았다.
개선점
- 할 일 우선순위를 잘 세우기!
- 우선순위에 따라 부지런하게 진행하기!
배운 점
- AutoGluon과 Optuna라는 유용한 도구를 알게 되어 처음 활용해 보았다.
- 아무리 모델이 좋아도 데이터가 좋지 않으면 성능이 쉽게 오르지 않을 수 있다는 것을 경험을 통해 느꼈다.