- Lowering max_depth reduces the complexity of each individual decision tree; it is usually kept at 5 or below.
- n_estimators: set as large as the available time and memory budget allows.
- Find an appropriate learning_rate (see the sweep sketch below).
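A minimal learning_rate sweep to build intuition; the candidate values are illustrative, and it reuses the get_breast_cancer_dataset helper from these notes:
from sklearn.ensemble import GradientBoostingClassifier
from dataset import get_breast_cancer_dataset

(X_train, X_test, y_train, y_test), feature_names = get_breast_cancer_dataset()

for lr in [0.001, 0.01, 0.1, 0.5]:  # illustrative candidates
    model = GradientBoostingClassifier(learning_rate=lr, random_state=0)
    model.fit(X_train, y_train)
    print(f"learning_rate={lr}: "
          f"train acc={model.score(X_train, y_train):.3f}, "
          f"test acc={model.score(X_test, y_test):.3f}")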
# Load and split the data
from dataset import get_breast_cancer_dataset
(X_train, X_test, y_train, y_test), feature_names = get_breast_cancer_dataset()
# Create, train, and evaluate the model
from sklearn.ensemble import GradientBoostingClassifier
# defaults: learning_rate=0.1, n_estimators=100
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train, y_train)
pred_train = gbc.predict(X_train)
pred_test = gbc.predict(X_test)
proba_train = gbc.predict_proba(X_train)
proba_test = gbc.predict_proba(X_test)
from metrics import print_metrics_classification
print_metrics_classification(y_train, pred_train, proba_train[:, 1], "Trainset")
print_metrics_classification(y_test, pred_test, proba_test[:, 1], "Testset")
# Inspect feature importances
import pandas as pd
fi = pd.Series(gbc.feature_importances_, index=feature_names)
fi.sort_values(ascending=False)
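A minimal sketch for visualizing the sorted importances (assumes matplotlib is installed):
import matplotlib.pyplot as plt

fi.sort_values().plot(kind="barh", figsize=(7, 9))  # horizontal bars, most important at the top
plt.title("GradientBoosting feature importances")
plt.tight_layout()
plt.show()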
# Example hyperparameter values (illustrative; tune for your data)
n_estimators, lr, max_depth = 1000, 0.01, 1
gbc2 = GradientBoostingClassifier(n_estimators=n_estimators,
                                  learning_rate=lr,
                                  max_depth=max_depth,
                                  random_state=0)
gbc2.fit(X_train, y_train)
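One way to pick these values is a grid search with cross-validation; a minimal sketch (the grid itself is illustrative):
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 500, 1000],     # illustrative grid
    "learning_rate": [0.001, 0.01, 0.1],
    "max_depth": [1, 2, 3],
}
gs = GridSearchCV(GradientBoostingClassifier(random_state=0),
                  param_grid, scoring="accuracy", cv=4, n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)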
from xgboost import XGBClassifier, XGBRegressor
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.01, max_depth=1, random_state=0)
xgb.fit(X_train, y_train)
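Mirroring the GradientBoosting evaluation above, using the same print_metrics_classification helper:
pred_test = xgb.predict(X_test)
proba_test = xgb.predict_proba(X_test)
print_metrics_classification(y_test, pred_test, proba_test[:, 1], "XGBoost-Testset")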
Parameters
#import
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from dataset import get_breast_cancer_dataset
from metrics import print_metrics_classification
# Data
(X_train, X_test, y_train, y_test), feature_names = get_breast_cancer_dataset()
# Create the models
## SVM and KNN need feature scaling -> wrap them in a Pipeline
### Assume each model has already been tuned to give its best performance.
knn = Pipeline(steps=[('scaler', StandardScaler()),
                      ('knn', KNeighborsClassifier(n_neighbors=5))])
svm = Pipeline(steps=[("scaler", StandardScaler()),
                      ("svm", SVC(random_state=0, probability=True))])  # probability=True is required for soft voting.
rfc = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
xgb = XGBClassifier(n_estimators=500, learning_rate=0.01, max_depth=1, random_state=0)
model_list = [
    ("knn", knn),
    ("svm", svm),
    ("RandomForest", rfc),
    ("XGBoost", xgb)
]
test_predict_dict = {}  # store each model's predictions on the test set
for name, model in model_list:
    # train
    model.fit(X_train, y_train)
    # evaluate
    ## predict
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    test_predict_dict[name] = pred_test
    ## report metrics
    print_metrics_classification(y_train, pred_train, title=f"{name}-Train set")
    print_metrics_classification(y_test, pred_test, title=f"{name}-Test set")
    print("\n++++++++++++++++++++++++++\n")
estimators = [
    ("svm", svm),
    ("knn", knn),
    ("random forest", rfc)
]
voting = VotingClassifier(estimators=estimators) # default: hard voting
voting.fit(X_train, y_train)
print_metrics_classification(y_test, voting.predict(X_test), title='test set')
# soft voting
voting = VotingClassifier(estimators=estimators, voting="soft")  # voting="soft" averages the models' predicted probabilities
voting.fit(X_train, y_train)
print_metrics_classification(y_test, voting.predict(X_test), title='test set')
.corr() -> correlation coefficient between the models' predictions (shows how similarly they predict; less-correlated models tend to complement each other in an ensemble)
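For example, with the test_predict_dict collected in the loop above:
import pandas as pd

pred_df = pd.DataFrame(test_predict_dict)
print(pred_df.corr())  # pairwise correlation between the models' test-set predictions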
Parameters
- estimators: the (name, model) pairs to ensemble
# Data
from dataset import get_boston_dataset
X_train, X_test, y_train, y_test = get_boston_dataset()
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
# LinearRegression and KNN need feature scaling
lr = make_pipeline(StandardScaler(), LinearRegression())
rfr = RandomForestRegressor(max_depth=3, random_state=0)
knn = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=5))
# Train each individual model and report its metrics
from metrics import print_metrics_regression
model_list = [
    ('Linear Regression', lr),
    ('Random Forest', rfr),
    ('KNN', knn)
]
for name, model in model_list:
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    print_metrics_regression(y_train, pred_train, f"{name} - trainset")
    print_metrics_regression(y_test, pred_test, f"{name} - testset")
    print("----------------------------------------------------")
from sklearn.ensemble import VotingRegressor

voting_reg = VotingRegressor(estimators=model_list)
voting_reg.fit(X_train, y_train)
print_metrics_regression(y_test, voting_reg.predict(X_test), "voting - testset")
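VotingRegressor also accepts a weights parameter for weighted averaging; a minimal sketch (the weights below are illustrative, not tuned):
# Give the random forest twice the weight of the other models (illustrative weights).
voting_reg2 = VotingRegressor(estimators=model_list, weights=[1, 2, 1])
voting_reg2.fit(X_train, y_train)
print_metrics_regression(y_test, voting_reg2.predict(X_test), "weighted voting - testset")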