
Santander Customer Satisfaction


111๊ฐ์ ํผ์ฒ๊ฐ floatํ, 260๊ฐ์ ํผ์ฒ๊ฐ int ํ์ผ๋ก ๋ชจ๋ ํผ์ฒ๊ฐ ์ซ์ํ์ด๋ฉฐ, Null ๊ฐ์ ์๋ค. ๋ ์ด๋ธ Target์์ ๋๋ถ๋ถ์ด ๋ง์กฑ์ด๋ฉฐ ๋ถ๋ง์กฑ์ธ ๊ณ ๊ฐ์ 4%์ ๋ถ๊ณผํ๋ค.
min๊ฐ์ -999999์ ์ต๋ค๊ฐ 2๋ก ๋ณํํ๊ณ ID ํผ์ฒ๋ ๋จ์ ์๋ณ์์ด๋ฏ๋ก ํผ์ฒ๋ฅผ ๋๋กญํ๋ค.
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
# NOTE(review): the two print strings below were split mid-word by extraction;
# rejoined here so the literals are valid again.
print('ํ์ต ์ธํธ Shape:{0}, ํ์คํธ ์ธํธ Shape:{1}'.format(X_train.shape, X_test.shape))
print('ํ์ต ์ธํธ ๋ ์ด๋ธ ๊ฐ ๋ถํฌ ๋น์จ')
print(y_train.value_counts()/train_cnt)
print('\nํ์คํธ ์ธํธ ๋ ์ด๋ธ ๊ฐ ๋ถํฌ ๋น์จ')
print(y_test.value_counts()/test_cnt)
# XGBoost early stopping needs a validation set:
# split the training data again into train (70%) / validation (30%) subsets.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.3, random_state=0)
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
# n_estimators=500; random_state fixed so repeated runs give identical predictions.
xgb_clf = XGBClassifier(n_estimators=500, learning_rate=0.05, random_state =156)
# Evaluate with AUC and stop early after 100 rounds without improvement on the eval set.
xgb_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric="auc", eval_set = [(X_tr, y_tr), (X_val, y_val)])
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(xgb_roc_score))
[Output]
ROC AUC: 0.8429
3 Fold ๊ต์ฐจ ๊ฒ์ฆ์ ์ด์ฉํด ํ๊ท ROC-AUC ๊ฐ ๋ฐํ (-1*ROC-AUC)
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
# fmin()์์ ํธ์ถ์ search_space ๊ฐ์ผ๋ก XGBClassifier ๊ต์ฐจ ๊ฒ์ฆ ํ์ต . ํ -1*roc_auc ํ๊ท ๊ฐ์ ๋ฐํ
# ๋ชฉ์ ํจ์
def objective_func(search_space):
    """HyperOpt objective: 3-fold CV mean ROC-AUC of an XGBClassifier.

    Returns the negated mean because fmin() minimizes the objective.
    NOTE(review): relies on module-level X_train / y_train (pandas objects).
    """
    xgb_clf = XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            colsample_bytree=search_space['colsample_bytree'],
                            learning_rate=search_space['learning_rate'])
    # ROC-AUC score collected per fold.
    roc_auc_list = []
    # 3-fold cross validation.
    kf = KFold(n_splits=3)
    for tr_index, val_index in kf.split(X_train):
        # Split X_train into this fold's train / validation subsets.
        X_tr, y_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
        X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]
        # Train with early stopping after 30 rounds without AUC improvement.
        xgb_clf.fit(X_tr, y_tr, early_stopping_rounds=30, eval_metric="auc",
                    eval_set=[(X_tr, y_tr), (X_val, y_val)])
        # Positive-class probability -> fold ROC-AUC.
        score = roc_auc_score(y_val, xgb_clf.predict_proba(X_val)[:,1])
        roc_auc_list.append(score)
    # fmin() searches for the minimum, so return -1 * mean ROC-AUC.
    return -1*np.mean(roc_auc_list)
from hyperopt import fmin, tpe, Trials

trials = Trials()
# fmin() runs up to max_evals trials and returns the hyper-parameter values
# that minimize objective_func over xgb_search_space.
best = fmin(fn=objective_func,
            space=xgb_search_space,
            algo=tpe.suggest,
            max_evals=50,  # maximum number of trials
            trials=trials, rstate=np.random.default_rng(seed=30))
print('best:', best)
#30๋ถ ์์...
#๋ชฉ์ ๋ฐํ ์ต์๊ฐ์ ๊ฐ์ง๋ ์ต์ ์ ๋ ฅ๊ฐ ์ ์ถ
# Retrain with n_estimators increased to 500 using the best hyper parameters
# found by HyperOpt (the values live in the `best` dict returned by fmin()).
xgb_clf = XGBClassifier(n_estimators=500, learning_rate=round(best['learning_rate'], 5),
                        max_depth=int(best['max_depth']),
                        min_child_weight=int(best['min_child_weight']),
                        colsample_bytree=round(best['colsample_bytree'], 5)
                        )
# AUC as evaluation metric; stop early after 100 rounds without improvement.
xgb_clf.fit(X_tr, y_tr, early_stopping_rounds=100,
            eval_metric="auc", eval_set=[(X_tr, y_tr), (X_val, y_val)])
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:,1])
# Fixed format spec: '{0:4f}' (minimum field width) -> '{0:.4f}' (4 decimal
# places), consistent with the other ROC AUC prints in this file.
print('ROC AUC: {0:.4f}'.format(xgb_roc_score))
from lightgbm import LGBMClassifier
# LightGBM baseline: 500 estimators, AUC early stopping after 100 rounds.
lgbm_clf = LGBMClassifier(n_estimators = 500)
eval_set = [(X_tr, y_tr), (X_val, y_val)]
lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric="auc", eval_set= eval_set)
# ROC-AUC on the held-out test set using positive-class probabilities.
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC:{0:.4f}'.format(lgbm_roc_score))
# HyperOpt search space for LightGBM.
# hp.quniform(label, low, high, q): uniform over [low, high] quantized to multiples of q.
# hp.uniform(label, low, high): continuous uniform over [low, high].
lgbm_search_space = {'num_leaves' :hp.quniform('num_leaves', 32, 64, 1),
'max_depth':hp.quniform('max_depth', 100, 160, 1),
'min_child_samples':hp.quniform('min_child_samples', 60, 100,1),
'subsample': hp.uniform('subsample', 0.7, 1),
'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)}
def objective_func(search_space):
    """HyperOpt objective: 3-fold CV mean ROC-AUC of an LGBMClassifier.

    Returns the negated mean because fmin() minimizes the objective.
    NOTE(review): relies on module-level X_train / y_train (pandas objects).
    """
    lgbm_clf = LGBMClassifier(n_estimators=100,
                              num_leaves=int(search_space['num_leaves']),
                              max_depth=int(search_space['max_depth']),
                              min_child_samples=int(search_space['min_child_samples']),
                              subsample=search_space['subsample'],
                              learning_rate=search_space['learning_rate'])
    # ROC-AUC score collected per fold.
    roc_auc_list = []
    # 3-fold cross validation.
    kf = KFold(n_splits=3)
    for tr_index, val_index in kf.split(X_train):
        # Split X_train into this fold's train / validation subsets.
        X_tr, y_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
        X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]
        # Train with early stopping after 30 rounds without AUC improvement.
        lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=30, eval_metric="auc",
                     eval_set=[(X_tr, y_tr), (X_val, y_val)])
        # Positive-class probability -> fold ROC-AUC.
        score = roc_auc_score(y_val, lgbm_clf.predict_proba(X_val)[:,1])
        roc_auc_list.append(score)
    # fmin() searches for the minimum, so return -1 * mean ROC-AUC.
    return -1*np.mean(roc_auc_list)
(์ต์ ํ์ดํผ ํ๋ผ๋ฏธํฐ๋ก ROC-AUC ํ๊ฐ ์๋ต)
[Output]
ROC AUC : 0.8446
XGBoost์ ์ ์ฌํ ๊ฒฐ๊ณผ๋ฅผ ๋ณด์. LightGBM์ ํ์ต ์๊ฐ์ด XGBoost๋ณด๋ค ๋น ๋ฅด๊ธฐ ๋๋ฌธ์ LightGBM์ผ๋ก ํ๋ จ ํ์.
์ด์ ๋ฐ์ดํฐ ์ ์ฒด ๋ฐ์ดํฐ์ ํจํด์์ ๋ฒ์ด๋ ์ด์ ๊ฐ์ ๊ฐ์ง ๋ฐ์ดํฐ
์ด์ ๋ ์ด๋ธ์ ๊ฐ์ง๋ ๋ฐํฐํฐ ๊ฑด์๋ ๋งค์ฐ ์ ๊ธฐ ๋๋ฌธ์ ์ ๋๋ก ๋ค์ํ ์ ํ ํ์ต์ ๋ชปํ๊ณ ์ ์ ๋ ์ด๋ธ๋ก ์น์ฐ์น ํ์ต์ ์ํํด ์ ๋๋ก ๋ ์ด์ ๋ฐ์ดํฐ ๊ฒ์ถ์ด ์ด๋ ค์์ง
(ํ๊ท ์น๋ก ํ๋ จ๋ ๋ชจ๋ธ์ ์ด์ ๋ฐ์ดํฐ ๊ฒ์ถ์ด ์ด๋ ต๋ค)
์ธ๋ ์ํ๋ง ๋ง์ ๋ฐ์ดํฐ ์ธํธ๋ฅผ ์ ์ ๋ฐ์ดํฐ ์ธํธ ์์ค์ผ๋ก ๊ฐ์์ํค๋ ๋ฐฉ์
๊ณผ๋ํ๊ฒ ์ ์๋ ์ด๋ธ๋ก ํ์ต/์์ธกํ๋ ๋ถ์์ฉ์ ๊ฐ์ ํ ์ ์์ง๋ง, ์ ๋๋ก ๋ ํ์ต์ ์ํํ ์์๋ ๋ฌธ์ ๋ ๋ฐ์ ๊ฐ๋ฅ์ฑ
์ค๋ฒ ์ํ๋ง ์ด์ ๋ฐ์ดํฐ์ ๊ฐ์ด ์ ์ ๋ฐ์ดํฐ ์ธํธ๋ฅผ ์ฆ์ํ์ฌ ํ์ต์ ์ํ ์ถฉ๋ถํ ๋ฐ์ดํฐ๋ฅผ ํ๋ณดํ๋ ๋ฐฉ๋ฒ. ๊ณผ์ ํฉ ๋๊ธฐ ๋๋ฌธ์ ์๋ณธ ๋ฐ์ดํฐ์ ํผ์ฒ ๊ฐ๋ค์ ์์ฃผ ์ฝ๊ฐ๋ง ๋ณ๊ฒฝํ์ฌ ์ฆ์.
SMOTE (์ค๋ฒ์ํ๋ง) k-์ต๊ทผ์ ์ด์์์ ์ด์๋ค์ ์ฐจ์ด๋ฅผ ์ผ์ ๊ฐ์ผ๋ก ๋ง๋ค์ด์ ๊ธฐ์กด ๋ฐ์ดํฐ์ ์ฝ๊ฐ ์ฐจ์ด๊ฐ ๋๋ ์๋ก์ด ๋ฐ์ดํฐ๋ค์ ์์ฑํ๋ ๋ฐฉ์
ํ ์คํธ ์ธํธ๋ฅผ ์ ์ฒด์ 30%์ธ Stratified ๋ฐฉ์์ผ๋ก ์ถ์ถํด ํ์ต ๋ฐ์ดํฐ ์ธํธ์ ํ ์คํธ ๋ฐ์ดํฐ ์ธํธ์ ๋ ์ด๋ธ ๊ฐ ๋ถํฌ๋๋ฅผ ์๋ก ๋์ผํ๊ฒ ๋ง๋ฆ.
Stratified Sampling
์์ ์ถ์ถ์ ๋ฐ์ดํฐ ๋น์จ์ ๋ฐ์ํ์ง ๋ชปํ๋ค๋ ๋จ์ ์ด ์์ด, ๊ณ์ธต ์ถ์ถ์ด ๊ถ์ฅ๋จ
ex) StratifiedShuffleSplit(), StratifiedKFold(), train_test_split()
# ์ฌ์ ๋ฐ์ดํฐ ๊ฐ๊ณต ํ ํ์ต๊ณผ ํ
์คํธ ๋ฐ์ดํฐ ์ธํธ๋ฅผ ๋ฐํํ๋ ํจ์
def get_train_test_dataset(df=None):
    """Preprocess df and return stratified train/test splits.

    Returns X_train, X_test, y_train, y_test with the test set being 30%
    of the data. stratify=y_target keeps the label distribution identical
    in both splits (important for the highly imbalanced fraud label).
    """
    # Work on a preprocessed copy so the caller's DataFrame is untouched.
    df_copy = get_preprocessed_df(df)
    # Last column is the label; every other column is a feature.
    X_features = df_copy.iloc[:, :-1]
    y_target = df_copy.iloc[:, -1]
    # Stratified split so train/test keep the same label ratio.
    X_train, X_test, y_train, y_test = \
        train_test_split(X_features, y_target, test_size=0.3, random_state=0, stratify=y_target)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)
1) Logistic
[Output]
์ค์ฐจ ํ๋ ฌ
[[85281 14]
[ 58 90]]
์ ํ๋: 0.9992, ์ ๋ฐ๋: 0.8654, ์ฌํ์จ: 0.6081, F1:0.7143, AUC:0.9703
2) LightGBM
[Output]
์ค์ฐจ ํ๋ ฌ
[[85281 14]
[ 58 90]]
์ ํ๋: 0.9992, ์ ๋ฐ๋: 0.8654, ์ฌํ์จ: 0.6081, F1:0.7143, AUC:0.9703
LightGBM์ด ๋ก์ง์คํฑ ํ๊ท๋ณด๋ค ๋์ ์์น๋ฅผ ๋ํ๋.
๋ก์ง์คํฑ ํ๊ท์ ๊ฐ์ด ์ ํ ๋ชจ๋ธ์ ์ค์ ํผ์ฒ๋ค์ ๊ฐ์ด ์ ๊ท ๋ถํฌ ํํ๋ฅผ ์ ์งํ๋ ๊ฒ์ ์ ํธ
Amount feature ๋ถํฌ๋

from sklearn.preprocessing import StandardScaler
# ์ฌ์ดํท๋ฐ์ StandardScaler๋ฅผ ์ด์ฉํ์ฌ ์ ๊ท๋ถํฌ ํํ๋ก Amount ํผ์ฒ๊ฐ ๋ณํํ๋ ๋ก์ง์ผ๋ก ์์ .
def get_preprocessed_df(df=None):
    """Return a copy of df with Amount standardized (zero mean, unit variance).

    The scaled values are inserted as a new first column 'Amount_Scaled';
    the original 'Time' and 'Amount' columns are dropped.
    """
    df_copy = df.copy()
    scaler = StandardScaler()
    # StandardScaler expects a 2-D array, hence reshape(-1, 1).
    amount_n = scaler.fit_transform(df_copy['Amount'].values.reshape(-1, 1))
    # Insert the scaled feature as the first column.
    df_copy.insert(0, 'Amount_Scaled', amount_n)
    # Drop the original Time and Amount columns.
    df_copy.drop(['Time','Amount'], axis=1, inplace=True)
    return df_copy
log1p() ๋ก๊ทธ ๋ณํ์ ๋ฐ์ดํฐ ๋ถํฌ๋๊ฐ ์ฌํ๊ฒ ์๊ณก๋์ด ์์ ๊ฒฝ์ฐ ์ ์ฉํ๋ ์ค์ ๊ธฐ๋ฒ ์ค ํ๋
def get_preprocessed_df(df=None):
    """Return a copy of df with Amount log-transformed via np.log1p.

    log1p compresses the heavily right-skewed Amount distribution and is
    defined at Amount == 0. The transformed values become the first column
    'Amount_Scaled'; the original 'Time' and 'Amount' columns are dropped.
    """
    df_copy = df.copy()
    # np.log1p(x) == log(1 + x): safe at 0, tames the long tail.
    amount_n = np.log1p(df_copy['Amount'])
    df_copy.insert(0, 'Amount_Scaled', amount_n)
    df_copy.drop(['Time','Amount'], axis=1, inplace=True)
    return df_copy
๋ ์ด๋ธ์ด ๊ทน๋๋ก ๋ถ๊ท ์ผํ ๋ฐ์ดํฐ ์ธํธ์์ ๋ก์ง์คํฑ ํ๊ท๋ ๋ฐ์ดํฐ ๋ณํ ์ ์ฝ๊ฐ์ ๋ถ์์ ํ ์ฑ๋ฅ ๊ฒฐ๊ณผ๋ฅผ ๋ณด์ฌ์ค
์ฌ๋ถ์ ๊ฐ์ ํธ์ฐจ๋ฅผ ์ด์ฉํ๋ ๊ธฐ๋ฒ
25% ๊ตฌ๊ฐ์ธ Q1 ~ 75% ๊ตฌ๊ฐ์ธ Q3์ ๋ฒ์๋ฅผ IQR๋ผ๊ณ ํจ.
์ด์์ ๋ฐ์ดํฐ ๋ฒ์๋ฅผ ๋ฒ์ด๋ ๋ฐ์ดํฐ๋ฅผ ์ด์์น๋ก ๊ฐ์ฃผ

IQR ๋ฐฉ์์ ์๊ฐํํ ๋ํ๊ฐ ๋ฐ์ค ํ๋กฏ

๋จผ์ ์ด๋ค ํผ์ฒ์ ์ด์์น ๋ฐ์ดํฐ๋ฅผ ๊ฒ์ถํ ๊ฒ์ธ์ง ์ ํ์ด ํ์
๊ฒฐ์ ๊ฐ๊ณผ ๊ฐ์ฅ ์๊ด์ฑ์ด ๋์ ํผ์ฒ๋ค์ ์์ฃผ๋ก ์ด์์น๋ฅผ ๊ฒ์ถํ๋ ๊ฒ์ด ์ข์

V14์ ๋ํ ์ด์์น๋ฅผ ์ฐพ์ ์ ๊ฑฐ
import numpy as np
def get_outlier(df=None, column=None, weight=1.5):
    """Return the index of IQR outliers in `column` among fraud rows.

    Only rows with Class == 1 are examined. Q1/Q3 are the 25th/75th
    percentiles of that subset; values outside
    [Q1 - weight*IQR, Q3 + weight*IQR] are flagged as outliers.
    """
    # Outliers are detected on the minority (fraud) class only.
    fraud = df[df['Class'] == 1][column]
    quantile_25 = np.percentile(fraud.values, 25)
    quantile_75 = np.percentile(fraud.values, 75)
    # Build the IQR fences.
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight
    # Rows outside the fences; return their DataFrame index.
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index
    return outlier_index
def get_preprocessed_df(df=None):
    """Return a copy of df log-scaled on Amount with V14 IQR outliers removed."""
    df_copy = df.copy()
    # log1p-transform Amount into the new first column 'Amount_Scaled'.
    amount_n = np.log1p(df_copy['Amount'])
    df_copy.insert(0, 'Amount_Scaled', amount_n)
    df_copy.drop(['Time','Amount'], axis=1, inplace=True)
    # Drop rows whose V14 value is an IQR outlier among fraud samples.
    outlier_index = get_outlier(df=df_copy, column='V14', weight=1.5)
    df_copy.drop(outlier_index, axis=0, inplace=True)
    return df_copy
# Rebuild the train/test split with the outlier-removal preprocessing applied,
# then compare logistic regression and LightGBM on the cleaned data.
X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)
print('### ๋ก์ง์คํฑ ํ๊ท ์์ธก ์ฑ๋ฅ ###')
get_model_train_eval(lr_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
print('### LightGBM ์์ธก ์ฑ๋ฅ ###')
get_model_train_eval(lgbm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
์ด์์น ๋ฐ์ดํฐ๋ฅผ ์ ๊ฑฐํ ๋ค, ๋ก์ง์คํฑ ํ๊ท์ LightGBM ๋ชจ๋ ์์ธก ์ฑ๋ฅ์ด ํฌ๊ฒ ํฅ์
๋ฐ๋์ ํ์ต ๋ฐ์ดํฐ ์ธํธ๋ง ์ค๋ฒ ์ํ๋ง์ ํด์ผ ๋จ.
๊ฒ์ฆ/ํ ์คํธ ์ธํธ์ ์ ์ฉ์ ์ฌ๋ฐ๋ฅธ ๊ฒ์ฆ/ํ ์คํธ๊ฐ ๋ ์ ์์.
from imblearn.over_sampling import SMOTE
# Oversample ONLY the training set; validation/test sets must stay untouched
# so evaluation remains honest.
smote = SMOTE(random_state=0)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
print('SMOTE ์ ์ฉ ์ ํ์ต์ฉ ํผ์ฒ/๋ ์ด๋ธ ๋ฐ์ดํฐ ์ธํธ: ', X_train.shape, y_train.shape)
print('SMOTE ์ ์ฉ ํ ํ์ต์ฉ ํผ์ฒ/๋ ์ด๋ธ ๋ฐ์ดํฐ ์ธํธ: ', X_train_over.shape, y_train_over.shape)
print('SMOTE ์ ์ฉ ํ ๋ ์ด๋ธ ๊ฐ ๋ถํฌ: \n', pd.Series(y_train_over).value_counts())
๋๋ฌด๋ ๋ง์ Class=1 ๋ฐ์ดํฐ๋ฅผ ํ์ตํ๋ฉด์ ์ค์ ํ ์คํธ ๋ฐ์ดํฐ ์ธํธ์์ ์์ธก์ ์ง๋์น๊ฒ Class=1๋ก ์ ์ฉํด ์ ๋ฐ๋๊ฐ ๊ธ๊ฒฉํ ๋จ์ด์ง๊ฒ ๋จ.

์ฌํ์จ ์งํ๋ฅผ ๋์ด๋ ๊ฒ์ด ๋จธ์ ๋ฌ๋ ๋ชจ๋ธ์ ์ฃผ์ํ ๋ชฉํ์ผ ๊ฒฝ์ฐ SMOTE๋ฅผ ์ ์ฉํ๋ฉด ์ข์.

import zipfile as zf

# Extract the Kaggle MoA archive into ./MoA.
# Use a context manager so the archive is closed even if extraction fails
# (the original opened and closed the handle manually, leaking it on error).
with zf.ZipFile("lish-moa.zip", 'r') as files:
    files.extractall("MoA")
pip install category_encoders
# Reproducibility and CV configuration for the MoA pipeline.
SEED = 42
NFOLDS = 5
DATA_DIR = './MoA/'
np.random.seed(SEED)
# Load MoA features, scored targets, test features and the submission template.
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
test = pd.read_csv(DATA_DIR+'test_features.csv')
sub = pd.read_csv(DATA_DIR+'sample_submission.csv')
# Drop the id column (first column) from features and targets.
X = train.iloc[:, 1:].to_numpy()
X_test = test.iloc[:, 1:].to_numpy()
y = targets.iloc[:, 1:].to_numpy()
# Multi-label setup: one XGBClassifier per target column, GPU histogram method.
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist'))
# Pipeline: count-encode the categorical feature columns (column 0 is cp_type,
# per the masking code below; column 2 presumably cp_dose — TODO confirm),
# then fit the multi-output classifier.
clf = Pipeline([('encode', CountEncoder(cols=[0, 2])),
('classify', classifier)
])
# Pre-tuned XGBoost hyper parameters, addressed through the pipeline step
# name ('classify') and the MultiOutputClassifier wrapper ('estimator').
params = {'classify__estimator__colsample_bytree': 0.6522,
'classify__estimator__gamma': 3.6975,
'classify__estimator__learning_rate': 0.0503,
'classify__estimator__max_delta_step': 2.0706,
'classify__estimator__max_depth': 10,
'classify__estimator__min_child_weight': 31.5800,
'classify__estimator__n_estimators': 166,
'classify__estimator__subsample': 0.8639
}
_ = clf.set_params(**params)
oof_preds = np.zeros(y.shape)  # out-of-fold (OOF) predictions
test_preds = np.zeros((test.shape[0], y.shape[1]))
oof_losses = []
kf = KFold(n_splits=NFOLDS)
# 5-fold CV: train on 4 folds, predict the held-out fold and the test set.
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold: ', fn)
    # Split into train/validation sets using the fold indices.
    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]
    # Drop control rows from the training data (baseline approach).
    # Fixed typo: the original compared against 'ct1_vehicle' (digit one),
    # which matches nothing; the dataset value is 'ctl_vehicle' (see the
    # masking code later in this file).
    ctl_mask = X_train[:, 0] == 'ctl_vehicle'
    X_train = X_train[~ctl_mask, :]
    y_train = y_train[~ctl_mask]
    # Fit the encode+classify pipeline.
    clf.fit(X_train, y_train)
    val_preds = clf.predict_proba(X_val)  # list of per-target probability arrays
    val_preds = np.array(val_preds)[:, :, 1].T  # keep the positive class
    oof_preds[val_idx] = val_preds
    # Fixed typo: np.ravle -> np.ravel.
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_losses.append(loss)
    # Accumulate this fold's test predictions (averaged over NFOLDS).
    preds = clf.predict_proba(X_test)
    preds = np.array(preds)[:, :, 1].T  # keep the positive class
    test_preds += preds / NFOLDS
print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))
[Output]
Starting fold: 0
Starting fold: 1
Starting fold: 2
Starting fold: 3
Starting fold: 4
[0.0169781773377249, 0.01704491710861325, 0.016865153552168475, 0.01700900926983899, 0.01717882474706338]
Mean OOF loss across folds 0.017015216403081797
STD OOF loss across folds 0.00010156682747757948
# Control samples ('ctl_vehicle') have no mechanism of action: zero out their
# train predictions before scoring the OOF log loss.
control_mask = train['cp_type']=='ctl_vehicle'
oof_preds[control_mask] = 0
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(oof_preds)))
# Zero out control test predictions as well.
control_mask = test['cp_type'] == 'ctl_vehicle'
test_preds[control_mask] = 0
# Write predictions into the submission template and save.
sub.iloc[:,1:] = test_preds
sub.to_csv('submission.csv', index=False)
