[TIL] 250130

January 30, 2025


Today's project progress

AutoGluon training

from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# data_root = 'https://autogluon.s3.amazonaws.com/datasets/Inc/'
# train_data: training DataFrame that includes the 'Exited' label column
# X_test, y_test: held-out features and labels from the earlier split

# AutoGluon trains and ensembles a suite of models automatically
predictor = TabularPredictor(label='Exited').fit(train_data=train_data)
predictions = predictor.predict(X_test)

def get_score(model_name, y_test, y_pred):
    """Print accuracy, F1, and AUC for one model's predictions."""
    acc = round(accuracy_score(y_test, y_pred), 3)
    f1 = round(f1_score(y_test, y_pred), 3)
    auc_ = round(roc_auc_score(y_test, y_pred), 3)
    print(model_name, 'accuracy: ', acc, 'f1_score: ', f1, 'AUC: ', auc_)

get_score('autogluon', y_test, predictions)

# autogluon accuracy:  0.865 
# f1_score:  0.627 
# AUC:  0.744
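
The AUC above is computed from hard 0/1 predictions; roc_auc_score is normally given positive-class probabilities, which makes it a more informative ranking metric. A minimal sketch of that check (my addition; it assumes the predictor, X_test, and y_test above, with integer 0/1 labels):

# Score AUC on predicted probabilities instead of hard labels.
# predict_proba returns one column per class; select the positive class (1).
proba = predictor.predict_proba(X_test)
auc_proba = round(roc_auc_score(y_test, proba[1]), 3)
print('autogluon AUC (probabilities):', auc_proba)

# leaderboard() on labeled data compares every model AutoGluon trained.
test_df = X_test.copy()          # hypothetical helper frame for scoring
test_df['Exited'] = y_test
print(predictor.leaderboard(test_df))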

Optuna training

import optuna
from optuna.samplers import TPESampler

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, accuracy_score


# Model selection (one of GBM, XGBoost, LightGBM)
MODEL_TYPE = "xgboost"  # "gbm", "xgboost", "lightgbm"
REFIT_METRIC = "f1_score"  # refit criterion ("f1_score" or "accuracy")

# Define the objective function (F1 Score + Accuracy)
def objective(trial):
    """Optuna objective: hyperparameter search scored by F1 / accuracy."""

    # Hyperparameters shared by all three model types
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    
    if MODEL_TYPE == "gbm":
        model = GradientBoostingClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        )
    
    elif MODEL_TYPE == "xgboost":
        model = XGBClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42
        )
    
    elif MODEL_TYPE == "lightgbm":
        model = LGBMClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            num_leaves=trial.suggest_int("num_leaves", 10, 100),
            min_child_samples=trial.suggest_int("min_child_samples", 5, 50),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            random_state=42
        )
    
    # Cross-validation (StratifiedKFold, 5 folds)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores, acc_scores = [], []
    
    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        
        f1 = f1_score(y_val, y_pred)
        acc = accuracy_score(y_val, y_pred)
        
        f1_scores.append(f1)
        acc_scores.append(acc)
    
    # Return the metric selected as the refit criterion
    return {
        "f1_score": np.mean(f1_scores),
        "accuracy": np.mean(acc_scores)
    }[REFIT_METRIC]

# Run Optuna (maximize the selected metric)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=-1)

# Print the best hyperparameters
print("Best Hyperparameters:", study.best_params)

# Train the final model with the best hyperparameters and evaluate on the test set
best_params = study.best_params
if MODEL_TYPE == "gbm":
    final_model = GradientBoostingClassifier(**best_params, random_state=42)
elif MODEL_TYPE == "xgboost":
    final_model = XGBClassifier(
        **best_params,
        random_state=42,
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False
    )
elif MODEL_TYPE == "lightgbm":
    final_model = LGBMClassifier(**best_params, random_state=42)

final_model.fit(X_train, y_train)
y_pred_test = final_model.predict(X_test)

final_f1 = f1_score(y_test, y_pred_test)
final_acc = accuracy_score(y_test, y_pred_test)

print(f"Final Model F1 Score: {final_f1:.4f}")
print(f"Final Model Accuracy: {final_acc:.4f}")

XGBoost

  • Best Hyperparameters: {'learning_rate': 0.06268984733977456, 'n_estimators': 350, 'max_depth': 5, 'subsample': 0.9960160633298945, 'colsample_bytree': 0.9295589390364003}
  • Final Model F1 Score: 0.6386
  • Final Model Accuracy: 0.8662

LightGBM

  • Best Hyperparameters: {'learning_rate': 0.07813884459733288, 'n_estimators': 450, 'max_depth': 5, 'num_leaves': 83, 'min_child_samples': 12, 'subsample': 0.505509951684884, 'colsample_bytree': 0.9832006138545093}
  • Final Model F1 Score: 0.6367
  • Final Model Accuracy: 0.8648
  • GBM training was stopped because it ran for over an hour; I'll retry it later if needed (a faster alternative is sketched below).
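
GradientBoostingClassifier grows each tree on the full dataset, which explains the hour-plus runtime. A hedged sketch of a faster alternative (my suggestion, not something from the original run): scikit-learn's histogram-based HistGradientBoostingClassifier, which bins features and is typically far faster at this data size. It reuses the X_train/y_train/X_test/y_test splits assumed throughout.

from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting: bins continuous features (255 bins by default),
# usually orders of magnitude faster than GradientBoostingClassifier.
hist_gbm = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=300,   # boosting iterations, analogous to n_estimators
    max_depth=5,
    random_state=42
)
hist_gbm.fit(X_train, y_train)
y_pred_hist = hist_gbm.predict(X_test)
print('hist-GBM f1:', round(f1_score(y_test, y_pred_hist), 3))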

Retrospective

What went well

  • Tried training high-performing models with AutoGluon and Optuna.

Areas for improvement

  • Set task priorities more carefully!
  • Work through them diligently in priority order!

What I learned

  • Discovered AutoGluon and Optuna, two useful tools, and used them for the first time.
  • Experienced firsthand that no matter how good the model is, performance won't improve easily if the data isn't good.