다중 모델 동시 평가

GisangLee·2022년 7월 30일
0

my_module

목록 보기
23/33
post-custom-banner

1. 모델, 하이퍼파라미터

# Registry of candidate regressors, keyed by class name. Every estimator is
# created with its default constructor arguments; tuning is left to the
# per-model grids looked up under the same keys.
models = dict(
    LinearRegression=LinearRegression(),
    RandomForestRegressor=RandomForestRegressor(),
    KNeighborsRegressor=KNeighborsRegressor(),
    DecisionTreeRegressor=DecisionTreeRegressor(),
    BaggingRegressor=BaggingRegressor(),
    XGBRegressor=XGBRegressor(),
)


# Hyper-parameter grids for GridSearchCV, keyed by the same names as `models`.
# Each value is a list holding one grid dict (parameter name -> candidates).

# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2. Only
# search over it when the installed LinearRegression still accepts it,
# otherwise GridSearchCV fails with an "invalid parameter" error.
_linear_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
}
if 'normalize' in LinearRegression().get_params():
    _linear_grid['normalize'] = [True, False]

# BaggingRegressor's `base_estimator` was renamed to `estimator` in
# scikit-learn 1.2 (old name removed in 1.4). Prefer the new name when it
# exists and fall back to the old one on earlier releases.
_bagging_estimator_key = (
    'estimator'
    if 'estimator' in BaggingRegressor().get_params()
    else 'base_estimator'
)

params = {
    'LinearRegression': [_linear_grid],

    'RandomForestRegressor': [{
        'n_estimators': [50, 60, 80],
    }],

    'KNeighborsRegressor': [{
        'n_neighbors': [2, 3, 4, 5, 6],
    }],

    'DecisionTreeRegressor': [{
        'max_depth': [2, 4, 6, 8, 10, 12],
    }],

    'BaggingRegressor': [{
        # None selects the library's default base estimator.
        _bagging_estimator_key: [None, GradientBoostingRegressor(), KNeighborsRegressor()],
        'n_estimators': [20, 50, 100],
    }],

    'XGBRegressor': [{
        'n_estimators': [50, 500],
    }],
}

2. 다중 평가 함수

# Shared accumulator: runregressors() appends one pandas Series of feature
# importances per evaluated model, and the plotting code further down reads
# the entries back in the same order.
important_features_list = list()

# Open a 20x12-inch figure up front.
# NOTE(review): runregressors() creates its own figure via plt.subplots(), so
# this figure appears to stay empty - confirm whether it is still needed.
plt.figure(num=None, figsize=(20, 12))

def _feature_importances(name, best_model, X_train, Y_train, feature_names):
    """Return a Series of per-feature importances for one fitted model.

    The source of the importances depends on the model:

    * ``LinearRegression``: the fitted coefficients (``coef_``, signed).
    * ``KNeighborsRegressor``: mean permutation importance on the training data.
    * ``BaggingRegressor``: mean of the members' ``feature_importances_``;
      when a member does not expose that attribute (e.g. a KNN base
      estimator chosen by the grid search) permutation importance is used.
    * anything else: the model's own ``feature_importances_``.
    """
    if name == "LinearRegression":
        values = best_model.coef_

    elif name == "KNeighborsRegressor":
        values = permutation_importance(
            best_model, X_train, Y_train, scoring='neg_mean_squared_error'
        ).importances_mean

    elif name == "BaggingRegressor":
        members = best_model.estimators_
        if all(hasattr(member, "feature_importances_") for member in members):
            values = np.mean(
                [member.feature_importances_ for member in members], axis=0
            )
        else:
            # The winning base estimator has no built-in importances, so
            # measure them on the ensemble as a whole instead of crashing.
            values = permutation_importance(
                best_model, X_train, Y_train, scoring='neg_mean_squared_error'
            ).importances_mean

    else:
        values = best_model.feature_importances_

    return pd.Series(values, index=feature_names)


def runregressors(X_train, Y_train, X_test, Y_test):
    """Grid-search every model in ``models`` and evaluate it on the test set.

    For each entry of the module-level ``models`` dict, a ``GridSearchCV``
    (cv=5) is fitted on the training data using the grid in ``params``. The
    refitted best estimator is scored on the test data, its feature
    importances are appended to the module-level ``important_features_list``,
    and a predicted-vs-actual scatter plot is drawn on a two-column grid.

    Args:
        X_train: Training features (2-D, one column per feature).
        Y_train: Training targets.
        X_test: Test features.
        Y_test: Test targets; must provide ``min()`` and ``max()``.

    Returns:
        A DataFrame with one row per model and the columns ``name``,
        ``best_estimator``, ``R2``, ``MAE``, ``MSE``,
        ``explained variance score`` and ``accuracy``. ``accuracy`` is
        ``best_estimator_.score(X_test, Y_test)``, which for scikit-learn
        regressors is the R^2 score, not a classification accuracy.

    Side effects:
        Clears and refills ``important_features_list``, prints one summary
        line per model and shows the matplotlib figure.
    """
    # Start from an empty list so that calling this function again does not
    # stack a second set of importances on top of the previous run.
    important_features_list.clear()

    # One subplot per model on a two-column grid. squeeze=False keeps `ax`
    # two-dimensional even with a single row; 5 inches per row reproduces the
    # original 20x15 figure for three rows.
    n_models = len(models)
    n_rows = max(1, (n_models + 1) // 2)
    fig, ax = plt.subplots(
        nrows=n_rows, ncols=2, figsize=(20, 5 * n_rows), squeeze=False
    )

    # Label the importances with as many names as there are feature columns
    # (previously fixed at four).
    feature_names = x_pca_cols[:np.shape(X_train)[1]]

    summary_rows = []
    for position, (name, est) in enumerate(models.items()):
        # --- best-parameter search -------------------------------------
        gscv = GridSearchCV(estimator=est, param_grid=params[name], cv=5)
        gscv.fit(X_train, Y_train)
        best_model = gscv.best_estimator_

        # Predict with the refitted best estimator.
        y_pred = gscv.predict(X_test)

        # --- evaluation ------------------------------------------------
        ascore = best_model.score(X_test, Y_test)
        score = explained_variance_score(Y_test, y_pred)
        mae = mean_absolute_error(Y_test, y_pred)
        mse = mean_squared_error(Y_test, y_pred)
        r2 = r2_score(Y_test, y_pred)

        msg2 = "%s: %f (%f)" % (name, score*100, mae*100)
        print(msg2)

        summary_rows.append({
            'name': name,
            'best_estimator': str(best_model),
            'R2': r2,
            'MAE': mae,
            'MSE': mse,
            'explained variance score': score,
            'accuracy': ascore,
        })

        # --- feature importance ----------------------------------------
        important_features_list.append(
            _feature_importances(name, best_model, X_train, Y_train, feature_names)
        )

        # --- predicted-vs-actual plot ----------------------------------
        row, col = divmod(position, 2)
        panel = ax[row][col]
        panel.scatter(Y_test, y_pred)
        # Dashed diagonal marks a perfect prediction.
        panel.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'k--', lw=2)
        panel.set_xlabel('실제값')
        panel.set_ylabel('예측값')
        panel.set_title(msg2)

    # Hide any grid cell that has no model (odd number of models).
    for unused in ax.flat[n_models:]:
        unused.set_visible(False)

    plt.show()

    result_summary_list = pd.DataFrame(
        summary_rows,
        columns=['name', 'best_estimator', 'R2', 'MAE', 'MSE',
                 'explained variance score', 'accuracy'],
    )
    return result_summary_list

3. Feature Importance 확인

# The first entry belongs to LinearRegression (the first key of `models`) and
# holds signed coefficients. Take absolute values so its bars are comparable
# with the non-negative importances of the other models. Series.abs() covers
# every feature and avoids integer-position indexing on a label-indexed
# Series, which pandas has deprecated.
if important_features_list:
    important_features_list[0] = important_features_list[0].abs()

# One horizontal bar chart per model on a two-column grid sized from the
# number of collected Series. squeeze=False keeps `ax` two-dimensional, and
# 5 inches per row reproduces the original 20x15 figure for three rows.
n_plots = len(important_features_list)
n_rows = max(1, (n_plots + 1) // 2)
fig, ax = plt.subplots(nrows=n_rows, ncols=2, figsize=(20, 5 * n_rows), squeeze=False)

# Titles come from the summary table, which lists the models in the same
# order in which their importances were appended.
nm = result_summary_list["name"].to_list()
for i_count, imp_fea in enumerate(important_features_list):
    row, col = divmod(i_count, 2)
    imp_fea.sort_values().plot(kind='barh', ax=ax[row][col])
    ax[row][col].set_title(nm[i_count])

# Hide any grid cell left without a chart.
for unused in ax.flat[n_plots:]:
    unused.set_visible(False)

plt.show()

4. 결과

# Bare expression: in a notebook cell this displays the summary table.
# NOTE(review): assumes result_summary_list was bound earlier to the value
# returned by runregressors(); that assignment is not visible here - confirm.
result_summary_list
profile
포폴 및 이력서 : https://gisanglee.github.io/web-porfolio/
post-custom-banner

0개의 댓글