# 1. Models and hyper-parameters
# Candidate regressors, keyed by the same names used in the `params`
# search grids below; each value is an unfitted estimator instance.
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'BaggingRegressor': BaggingRegressor(),
    'XGBRegressor': XGBRegressor(),
}
# Hyper-parameter search grids for GridSearchCV, keyed to match `models`.
# Each value is a list containing one param-dict, as GridSearchCV accepts.
# NOTE(review): LinearRegression's `normalize` option was deprecated in
# scikit-learn 0.24 and removed in 1.0, so GridSearchCV would raise on it;
# it has been dropped from the grid. Use a StandardScaler in a Pipeline
# if feature normalization is needed.
params = {
    'LinearRegression': [{
        'fit_intercept': [True, False],
        'copy_X': [True, False],
    }],
    'RandomForestRegressor': [{
        'n_estimators': [50, 60, 80],
    }],
    'KNeighborsRegressor': [{
        'n_neighbors': [2, 3, 4, 5, 6],
    }],
    'DecisionTreeRegressor': [{
        'max_depth': [2, 4, 6, 8, 10, 12],
    }],
    'BaggingRegressor': [{
        # NOTE(review): `base_estimator` was renamed to `estimator` in
        # scikit-learn 1.2 and removed in 1.4 — rename if running on >= 1.4.
        'base_estimator': [None, GradientBoostingRegressor(), KNeighborsRegressor()],
        'n_estimators': [20, 50, 100],
    }],
    'XGBRegressor': [{
        'n_estimators': [50, 500],
    }],
}
# 2. Multi-model evaluation function
# Accumulates one pd.Series of per-feature importances for each fitted model
# (appended inside runregressors, consumed by the plotting section below).
important_features_list = []
# NOTE(review): the original code also called plt.figure(figsize=(20, 12))
# here, which opened a stray blank figure — runregressors() creates its own
# fig/axes via plt.subplots(), so that extra figure was never drawn on and
# has been removed.
def runregressors(X_train, Y_train, X_test, Y_test):
    """Grid-search every model in `models`, evaluate on the test set, and plot.

    For each entry of the module-level `models`/`params` dicts this:
      1. runs a 5-fold GridSearchCV on the training data,
      2. scores the refitted best estimator on (X_test, Y_test),
      3. extracts per-feature importances and appends them to the
         module-level `important_features_list`,
      4. draws an actual-vs-predicted scatter on a 3x2 axes grid.

    Parameters
    ----------
    X_train, Y_train : training features / target
    X_test, Y_test   : held-out features / target

    Returns
    -------
    pd.DataFrame
        One row per model: name, best-estimator repr, R2, MAE, MSE,
        explained variance score, and the estimator's own ``score()``.
    """
    # 3x2 grid fits the six models defined in `models`.
    fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))
    result_name = []
    result_summary1 = []
    result_mae = []
    result_mse = []
    result_exp_var = []
    result_r2_score = []
    result_ac_score = []
    for i_count, name in enumerate(models):
        est = models[name]
        est_params = params[name]
        # Search for the best hyper-parameters with 5-fold cross-validation.
        gscv = GridSearchCV(estimator=est, param_grid=est_params, cv=5)
        gscv.fit(X_train, Y_train)
        result_summary1.append(str(gscv.best_estimator_))
        result_name.append(name)
        y_pred = gscv.predict(X_test)
        # Evaluate the refitted best estimator on the held-out set.
        ascore = gscv.best_estimator_.score(X_test, Y_test)
        score = explained_variance_score(Y_test, y_pred)
        mae = mean_absolute_error(Y_test, y_pred)
        mse = mean_squared_error(Y_test, y_pred)
        r2 = r2_score(Y_test, y_pred)
        msg2 = "%s: %f (%f)" % (name, score * 100, mae * 100)
        print(msg2)
        result_mse.append(mse)
        result_mae.append(mae)
        result_exp_var.append(score)
        result_r2_score.append(r2)
        result_ac_score.append(ascore)
        # Feature importance, per model family:
        #   RandomForest / DecisionTree / XGB : built-in feature_importances_
        #   LinearRegression                  : coef_ (signed)
        #   KNeighborsRegressor               : permutation_importance means
        #   BaggingRegressor                  : mean over fitted base estimators
        # NOTE(review): `x_pca_cols[:4]` assumes exactly four features —
        # confirm against the upstream PCA step.
        if name == "LinearRegression":
            important_features = pd.Series(gscv.best_estimator_.coef_,
                                           index=x_pca_cols[:4])
        elif name == "KNeighborsRegressor":
            results = permutation_importance(gscv.best_estimator_, X_train, Y_train,
                                             scoring='neg_mean_squared_error')
            important_features = pd.Series(results.importances_mean,
                                           index=x_pca_cols[:4])
        elif name == "BaggingRegressor":
            # BUGFIX: a fitted BaggingRegressor is not itself iterable — the
            # fitted base models live in `.estimators_`. (This still assumes
            # every base estimator exposes feature_importances_, which e.g. a
            # KNeighborsRegressor base does not — TODO confirm the grid.)
            feature_importances = np.mean(
                [tree.feature_importances_
                 for tree in gscv.best_estimator_.estimators_],
                axis=0)
            important_features = pd.Series(feature_importances,
                                           index=x_pca_cols[:4])
        else:
            important_features = pd.Series(gscv.best_estimator_.feature_importances_,
                                           index=x_pca_cols[:4])
        important_features_list.append(important_features)
        # Actual-vs-predicted scatter plus the y = x reference diagonal.
        col = i_count % 2
        row = i_count // 2
        ax[row][col].scatter(Y_test, y_pred)
        ax[row][col].plot([Y_test.min(), Y_test.max()],
                          [Y_test.min(), Y_test.max()], 'k--', lw=2)
        ax[row][col].set_xlabel('실제값')
        ax[row][col].set_ylabel('예측값')
        ax[row][col].set_title(msg2)
    plt.show()
    result_summary_list = pd.DataFrame({'name': result_name,
                                        'best_estimator': result_summary1,
                                        'R2': result_r2_score,
                                        'MAE': result_mae,
                                        'MSE': result_mse,
                                        'explained variance score': result_exp_var,
                                        'accuracy': result_ac_score})
    return result_summary_list
# 3. Inspect feature importances
# LinearRegression coefficients (index 0 of important_features_list) are
# signed; take absolute values so the bar chart compares magnitudes.
# (Replaces the original hard-coded range(0, 4) element-wise loop, so it
# works for any number of features.)
important_features_list[0] = important_features_list[0].abs()

# One horizontal bar chart of feature importances per model, on a 3x2 grid,
# titled with the model names taken from the summary DataFrame.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))
nm = result_summary_list.name.to_list()
for i_count, imp_fea in enumerate(important_features_list):
    col = i_count % 2
    row = i_count // 2
    imp_fea.sort_values().plot(kind='barh', ax=ax[row][col])
    ax[row][col].set_title(nm[i_count])
plt.show()
# 4. Results
# Display the model-comparison summary DataFrame (notebook-style bare
# expression; assumes result_summary_list = runregressors(...) was run).
result_summary_list