[Dacon/machine learning] FIFA 선수 이적료 예측 연습

hottogi·2022년 10월 31일
0

출처: https://dacon.io/competitions/open/235538/codeshare/2725?page=2&dtype=recent

공유된 코드에 기반한 학습용 게시물입니다.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from ngboost import NGBRegressor

# Read the three CSV files used by the rest of the script.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('FIFA_train.csv', 'FIFA_test.csv', 'submission.csv')
)

# Notebook-style previews of the first rows of each frame.
train.head()
test.head()
submission.head()

def con_period(x):
    """Translate a known contract-end date label into a decimal-year string.

    Recognised labels (for example ``'Jun 30, 2020'``) are mapped to a string
    holding a fractional year (``'2020.5'``). Any value that is not one of the
    known labels is handed back unchanged.
    """
    # Ordered (label, decimal-year) pairs; compared with == so that any kind
    # of input value, hashable or not, is handled the same way.
    known_dates = (
        ('Dec 31, 2018', '2019'),
        ('Jun 30, 2020', '2020.5'),
        ('Jun 30, 2019', '2019.5'),
        ('May 31, 2020', '2020.3333'),
        ('May 31, 2019', '2019.3333'),
        ('Jan 31, 2019', '2019.0833'),
        ('Jan 1, 2019', '2019'),
        ('Jan 12, 2019', '2019.034'),
        ('Dec 31, 2019', '2020'),
        ('Jun 1, 2019', '2019.416'),
    )
    for label, year_text in known_dates:
        if x == label:
            return year_text
    return x

# Columns fed to the models; the categorical ones are one-hot encoded below.
feature_cols = ['age', 'continent', 'contract_until', 'position', 'reputation',
                'stat_overall', 'stat_potential', 'stat_skill_moves']
categorical_cols = ['continent', 'position']
log_cols = ['age', 'stat_potential']

# Apply the same transformations to the training and the test frame.
for frame in (train, test):
    # Map date labels to decimal years, then express the result relative to 2018.
    frame['contract_until'] = frame['contract_until'].apply(con_period).astype('float64') - 2018
    frame[log_cols] = np.log1p(frame[log_cols])

X = pd.get_dummies(train[feature_cols], columns=categorical_cols)
# The regression target is log1p(value); predictions are inverted with expm1 later.
y = np.log1p(train['value'])

target = pd.get_dummies(test[feature_cols], columns=categorical_cols)
# Train and test are encoded independently, so a category present in only one
# of them would give the two frames different columns. Align the test matrix
# to the training columns (missing dummies become 0, extras are dropped) so the
# fitted models always receive the feature set and order they were trained on.
target = target.reindex(columns=X.columns, fill_value=0)

# Shuffled K-fold splitter with a fixed seed; reused by every model loop in this script.
kf = KFold(n_splits = 10, random_state = 521, shuffle = True)
ngb = NGBRegressor(random_state = 521, verbose = 500, n_estimators = 500)

# Take the fold count from the splitter so the averaging weight cannot drift
# away from n_splits.
n_folds = kf.get_n_splits()
ngb_pred = np.zeros(target.shape[0])
rmse_list = []
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    ngb.fit(tr_x, tr_y)

    # y is log1p(value): floor negative predictions at 0, then invert with expm1.
    pred = np.expm1(np.maximum(ngb.predict(val_x), 0))

    # RMSE is measured on the original (non-log) scale.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_pred = np.expm1(np.maximum(ngb.predict(target), 0)) / n_folds
    ngb_pred += sub_pred
print(f'{ngb.__class__.__name__}의 {n_folds}fold 평균 RMSE는 {np.mean(rmse_list)}')

rf = RandomForestRegressor(random_state = 521, n_estimators = 150)

# Take the fold count from the shared splitter so the averaging weight cannot
# drift away from n_splits.
n_folds = kf.get_n_splits()
rf_pred = np.zeros(target.shape[0])
rmse_list = []
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    rf.fit(tr_x, tr_y)

    # y is log1p(value): floor negative predictions at 0, then invert with expm1.
    pred = np.expm1(np.maximum(rf.predict(val_x), 0))

    # RMSE is measured on the original (non-log) scale.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_pred = np.expm1(np.maximum(rf.predict(target), 0)) / n_folds
    rf_pred += sub_pred
print(f'{rf.__class__.__name__}의 {n_folds}fold 평균 RMSE는 {np.mean(rmse_list)}')

etc = ExtraTreesRegressor(random_state = 521, n_estimators = 500)

# Take the fold count from the shared splitter so the averaging weight cannot
# drift away from n_splits.
n_folds = kf.get_n_splits()
etc_pred = np.zeros(target.shape[0])
rmse_list = []
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    etc.fit(tr_x, tr_y)

    # y is log1p(value): floor negative predictions at 0, then invert with expm1.
    pred = np.expm1(np.maximum(etc.predict(val_x), 0))

    # RMSE is measured on the original (non-log) scale.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_pred = np.expm1(np.maximum(etc.predict(target), 0)) / n_folds
    etc_pred += sub_pred
print(f'{etc.__class__.__name__}의 {n_folds}fold 평균 RMSE는 {np.mean(rmse_list)}')

gb = GradientBoostingRegressor(random_state = 521, max_depth = 5)

# Take the fold count from the shared splitter so the averaging weight cannot
# drift away from n_splits.
n_folds = kf.get_n_splits()
gb_pred = np.zeros(target.shape[0])
rmse_list = []
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    gb.fit(tr_x, tr_y)

    # y is log1p(value): floor negative predictions at 0, then invert with expm1.
    pred = np.expm1(np.maximum(gb.predict(val_x), 0))

    # RMSE is measured on the original (non-log) scale.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_pred = np.expm1(np.maximum(gb.predict(target), 0)) / n_folds
    gb_pred += sub_pred
print(f'{gb.__class__.__name__}의 {n_folds}fold 평균 RMSE는 {np.mean(rmse_list)}')

cb = CatBoostRegressor(random_state = 521, silent = True, depth = 3)

# Take the fold count from the shared splitter so the averaging weight cannot
# drift away from n_splits.
n_folds = kf.get_n_splits()
cb_pred = np.zeros(target.shape[0])
rmse_list = []
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    cb.fit(tr_x, tr_y)

    # y is log1p(value): floor negative predictions at 0, then invert with expm1.
    pred = np.expm1(np.maximum(cb.predict(val_x), 0))

    # RMSE is measured on the original (non-log) scale.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_pred = np.expm1(np.maximum(cb.predict(target), 0)) / n_folds
    cb_pred += sub_pred
print(f'{cb.__class__.__name__}의 {n_folds}fold 평균 RMSE는 {np.mean(rmse_list)}')

lgbm = LGBMRegressor(random_state = 521, max_depth = 4, n_estimators = 1000)

# Take the fold count from the shared splitter so the averaging weight cannot
# drift away from n_splits.
n_folds = kf.get_n_splits()
lgbm_pred = np.zeros(target.shape[0])
rmse_list = []
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    lgbm.fit(tr_x, tr_y)

    # y is log1p(value): floor negative predictions at 0, then invert with expm1.
    pred = np.expm1(np.maximum(lgbm.predict(val_x), 0))

    # RMSE is measured on the original (non-log) scale.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_pred = np.expm1(np.maximum(lgbm.predict(target), 0)) / n_folds
    lgbm_pred += sub_pred
print(f'{lgbm.__class__.__name__}의 {n_folds}fold 평균 RMSE는 {np.mean(rmse_list)}')

xgb = XGBRegressor(random_state = 521, max_depth = 5)

# Take the fold count from the shared splitter so the averaging weight cannot
# drift away from n_splits.
n_folds = kf.get_n_splits()
xgb_pred = np.zeros(target.shape[0])
rmse_list = []
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    xgb.fit(tr_x, tr_y)

    # y is log1p(value): floor negative predictions at 0, then invert with expm1.
    pred = np.expm1(np.maximum(xgb.predict(val_x), 0))

    # RMSE is measured on the original (non-log) scale.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_pred = np.expm1(np.maximum(xgb.predict(target), 0)) / n_folds
    xgb_pred += sub_pred
print(f'{xgb.__class__.__name__}의 {n_folds}fold 평균 RMSE는 {np.mean(rmse_list)}')

# Equal-weight blend of the seven models' fold-averaged predictions
# (summed left to right in the listed order).
model_preds = [gb_pred, rf_pred, etc_pred, lgbm_pred, cb_pred, ngb_pred, xgb_pred]
submission['value'] = sum(model_preds) / len(model_preds)

# Both thresholds are taken from the blended values before any adjustment.
q1 = submission['value'].quantile(0.004)
q2 = submission['value'].quantile(0.99)

# Scale the lowest tail down by 10% and the highest tail up by 10%.
submission.loc[submission['value'] <= q1, 'value'] *= 0.9
submission.loc[submission['value'] >= q2, 'value'] *= 1.1
submission
submission.to_csv('fifa0520.csv', index = False)
profile

0개의 댓글

관련 채용 정보