Source: Corey Wade (2022). XGBoost와 사이킷런을 활용한 그레이디언트 부스팅 (Gradient Boosting with XGBoost and scikit-learn, Korean edition). Seoul: Hanbit Media.
1. Split the data into Train and Test (holdout) sets.
2. Split the Train set again into Train and Validation sets, or use cross-validation.
3. Build the final model, then evaluate it on the Test set.
   - If the evaluation score is poor, go back to step 2 and repeat (a code sketch of this workflow follows).
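A minimal sketch of the workflow above, assuming a feature matrix X and target y (placeholder names, with LogisticRegression as a stand-in model):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# step 1: hold out a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
# step 2: carve a validation set out of the training data...
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, random_state=2)
# ...or cross-validate on the training set instead
model = LogisticRegression(max_iter=10000)
scores = cross_val_score(model, X_train, y_train, cv=5)
# step 3: fit the final model and evaluate it once on the held-out test set
model.fit(X_train, y_train)
print(model.score(X_test, y_test))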
pd.to_datetime() converts timestamps; the default unit is 'ns' (nanoseconds).
df['ns_date'] = pd.to_datetime(df['time_stamp'])
df['ms_date'] = pd.to_datetime(df['time_stamp'], unit='ms')
df[['time_stamp', 'ns_date', 'ms_date']].head(3)
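A self-contained sketch with hypothetical timestamp values, showing how unit changes the interpretation of the same integers:

import pandas as pd

demo = pd.DataFrame({'time_stamp': [1569417600000, 1569421200000]})  # hypothetical raw timestamps
demo['ns_date'] = pd.to_datetime(demo['time_stamp'])             # read as nanoseconds -> dates in 1970
demo['ms_date'] = pd.to_datetime(demo['time_stamp'], unit='ms')  # read as milliseconds -> dates in 2019
print(demo)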
df['month'] = df['date'].dt.month          # 1-12
df['hour'] = df['date'].dt.hour            # 0-23
df['dayofweek'] = df['date'].dt.dayofweek  # 0: Monday, 6: Sunday
def weekend(row):
    if row['dayofweek'] in [5, 6]:
        return 1
    else:
        return 0

df['weekend'] = df.apply(weekend, axis=1)
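The same flag can be computed without a row-wise apply; a vectorized sketch:

df['weekend'] = df['dayofweek'].isin([5, 6]).astype(int)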
Rush hour: 6-10 a.m. and 3-7 p.m. (weekdays only).
def rush_hour(row):
    if (row['hour'] in [6, 7, 8, 9, 15, 16, 17, 18]) and (row['weekend'] == 0):
        return 1
    else:
        return 0

df['rush_hour'] = df.apply(rush_hour, axis=1)
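Likewise, a vectorized sketch of the rush-hour flag:

rush_hours = [6, 7, 8, 9, 15, 16, 17, 18]
df['rush_hour'] = (df['hour'].isin(rush_hours) & (df['weekend'] == 0)).astype(int)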
pd.get_dummies() converts a categorical column into numeric (one-hot) indicator columns.
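For example, a minimal sketch on the 'cab_type' column:

pd.get_dummies(df['cab_type']).head(3)  # one 0/1 indicator column per category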
Frequency encoding: convert the frequency of each 'cab_type' category into a proportion of the whole dataset.
df['cab_freq'] = df.groupby('cab_type')['cab_type'].transform(lambda x: x.count() / len(df))
from category_encoders.target_encoder import TargetEncoder
encoder = TargetEncoder()
df['cab_type_mean'] = encoder.fit_transform(df['cab_type'], df['price'])
# df.groupby('cab_type')['price'].mean()  # to verify the encoded values
The same transformation can be done without the TargetEncoder class (note that TargetEncoder applies smoothing by default, so its output can differ slightly from the raw group means for small categories):
df['cab_type_mean2'] = df.groupby('cab_type')['price'].transform('mean')
# len(df) == sum(df['cab_type_mean'] == df['cab_type_mean2'])  # True
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def y_pred(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)  # convention: accuracy_score(y_true, y_pred)
    print(score)                            # report each model's test accuracy
    return y_pred
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)  # X, y: e.g., the breast cancer data loaded below
y_pred_gbtree = y_pred(XGBClassifier())
y_pred_dart = y_pred(XGBClassifier(booster='dart', one_drop=True))
y_pred_forest = y_pred(RandomForestClassifier(random_state=2))
y_pred_logistic = y_pred(LogisticRegression(max_iter=10000))
y_pred_xgb = y_pred(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1))
df_pred = pd.DataFrame(data= np.c_[y_pred_gbtree, y_pred_dart,
y_pred_forest, y_pred_logistic, y_pred_xgb],
columns=['gbtree', 'dart', 'forest', 'logistic', 'xgb'])
df_pred.corr()  # low correlation between models' predictions means more diversity, which benefits ensembling
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

kfold = StratifiedKFold(n_splits=5)  # 5-fold stratified CV, as in the stacking example below
estimators = []
logistic_model = LogisticRegression(max_iter=10000)
estimators.append(('logistic', logistic_model))
xgb_model = XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1)
estimators.append(('xgb', xgb_model))
rf_model = RandomForestClassifier(random_state=2)
estimators.append(('rf', rf_model))
ensemble = VotingClassifier(estimators)
scores = cross_val_score(ensemble, X, y, cv=kfold)
print('Voting result: %.3f' % scores.mean())
print('XGB (best single model) result: %.3f' % accuracy_score(y_test, y_pred_xgb))
# Voting result: 0.977
# XGB (best single model) result: 0.965
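VotingClassifier uses hard (majority) voting by default; voting='soft' averages the predicted class probabilities instead, which requires every base model to support predict_proba (all three here do). A minimal sketch:

soft_ensemble = VotingClassifier(estimators, voting='soft')
scores = cross_val_score(soft_ensemble, X, y, cv=kfold)
print('Soft voting result: %.3f' % scores.mean())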
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
# prepare the data
X, y = load_breast_cancer(return_X_y=True)
kfold = StratifiedKFold(n_splits=5)
# base models
base_models = []
base_models.append(('lr', LogisticRegression()))
base_models.append(('xgb', XGBClassifier()))
base_models.append(('rf', RandomForestClassifier(random_state=2)))
# meta model
meta_model = LogisticRegression()
# stacking ensemble
clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)
scores = cross_val_score(clf, X, y, cv=kfold)
print('Stacking result: %.3f' % scores.mean())  # Stacking result: 0.981
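StackingClassifier fits the meta model on the base models' out-of-fold predictions; setting passthrough=True additionally passes the original features to the meta model. A sketch of that variant (it may or may not improve the score):

clf_pt = StackingClassifier(estimators=base_models, final_estimator=meta_model, passthrough=True)
scores_pt = cross_val_score(clf_pt, X, y, cv=kfold)
print('Stacking (passthrough) result: %.3f' % scores_pt.mean())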