# 출처: https://dacon.io/competitions/open/235538/codeshare/2725?page=2&dtype=recent
# 공유된 코드에 기반한 학습용 게시물입니다.
# (Source: educational post based on the code shared at the link above.)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from ngboost import NGBRegressor
# Load the competition data: training set, test set, and the sample
# submission template (the CSV files are expected in the working directory).
train = pd.read_csv('FIFA_train.csv')
test = pd.read_csv('FIFA_test.csv')
submission = pd.read_csv('submission.csv')
# Notebook-style previews; in a plain script these bare expressions have no
# visible effect (nothing is printed).
train.head()
test.head()
submission.head()
# Literal contract-end date strings mapped to fractional-year strings
# (e.g. "Jun 30, 2020" is half a year past 2020-01-01 -> "2020.5").
_CONTRACT_END_TO_YEAR = {
    'Dec 31, 2018': '2019',
    'Jun 30, 2020': '2020.5',
    'Jun 30, 2019': '2019.5',
    'May 31, 2020': '2020.3333',
    'May 31, 2019': '2019.3333',
    'Jan 31, 2019': '2019.0833',
    'Jan 1, 2019': '2019',
    'Jan 12, 2019': '2019.034',
    'Dec 31, 2019': '2020',
    'Jun 1, 2019': '2019.416',
}


def con_period(x):
    """Convert a contract-expiry date string to a fractional-year string.

    Values that are not one of the known literal dates (e.g. plain year
    strings such as '2021') are returned unchanged.
    """
    return _CONTRACT_END_TO_YEAR.get(x, x)
# Preprocess both splits identically:
#  - contract_until becomes "fractional years remaining after 2018",
#  - the skewed age / stat_potential columns are log1p-transformed.
for df in (train, test):
    df['contract_until'] = df['contract_until'].apply(con_period).astype('float64') - 2018
    df[['age', 'stat_potential']] = np.log1p(df[['age', 'stat_potential']])

# Feature columns used by every model below.
FEATURE_COLS = ['age', 'continent', 'contract_until', 'position', 'reputation',
                'stat_overall', 'stat_potential', 'stat_skill_moves']

# One-hot encode the categorical columns; the target is modeled on the
# log1p scale (inverted with expm1 at prediction time).
X = pd.get_dummies(data=train[FEATURE_COLS], columns=['continent', 'position'])
y = np.log1p(train['value'])
target = pd.get_dummies(data=test[FEATURE_COLS], columns=['continent', 'position'])
kf = KFold(n_splits=10, random_state=521, shuffle=True)


def _cv_predict(model):
    """K-fold-train *model* on the global X/y and return the fold-averaged
    prediction for the global test features ``target`` (on the original,
    non-log value scale).

    For each fold the model is fit on the training part, its validation
    RMSE (after inverting the log1p target transform) is collected, and its
    test-set prediction is accumulated with weight 1/n_splits.

    Side effect: prints the model's mean fold RMSE.
    """
    # Divisor is tied to the KFold configuration instead of a hard-coded 10,
    # so changing n_splits keeps the average correct.
    n_folds = kf.get_n_splits()
    test_pred = np.zeros(target.shape[0])
    rmse_list = []
    for tr_idx, val_idx in kf.split(X, y):
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
        model.fit(tr_x, tr_y)
        # Predictions are on the log1p scale; clip negatives to 0 before
        # expm1 so the inverse transform never produces values below 0.
        val_pred = np.expm1(np.clip(model.predict(val_x), 0, None))
        rmse_list.append(np.sqrt(mean_squared_error(np.expm1(val_y), val_pred)))
        test_pred += np.expm1(np.clip(model.predict(target), 0, None)) / n_folds
    print(f'{model.__class__.__name__}의 10fold 평균 RMSE는 {np.mean(rmse_list)}')
    return test_pred


# Seven regressors, each cross-validated with the shared helper; the
# *_pred arrays are blended into the submission below.
ngb = NGBRegressor(random_state=521, verbose=500, n_estimators=500)
ngb_pred = _cv_predict(ngb)

rf = RandomForestRegressor(random_state=521, n_estimators=150)
rf_pred = _cv_predict(rf)

etc = ExtraTreesRegressor(random_state=521, n_estimators=500)
etc_pred = _cv_predict(etc)

gb = GradientBoostingRegressor(random_state=521, max_depth=5)
gb_pred = _cv_predict(gb)

cb = CatBoostRegressor(random_state=521, silent=True, depth=3)
cb_pred = _cv_predict(cb)

lgbm = LGBMRegressor(random_state=521, max_depth=4, n_estimators=1000)
lgbm_pred = _cv_predict(lgbm)

xgb = XGBRegressor(random_state=521, max_depth=5)
xgb_pred = _cv_predict(xgb)
# Blend the seven models' test predictions with equal weights.
submission['value'] = (gb_pred + rf_pred + etc_pred + lgbm_pred
                       + cb_pred + ngb_pred + xgb_pred) / 7

# Post-process the tails of the prediction distribution: values at or below
# the 0.4th percentile are shrunk by 10%, values at or above the 99th
# percentile are inflated by 10%. Both cutoffs are computed on the blended
# predictions before either adjustment is applied.
values = submission['value']
low_cut = values.quantile(0.004)
high_cut = values.quantile(0.99)
values = values.where(values > low_cut, values * 0.9)
values = values.where(values < high_cut, values * 1.1)
submission['value'] = values

submission.to_csv('fifa0520.csv', index=False)