# 출처: https://dacon.io/competitions/open/235536/codeshare/2721?page=1&dtype=recent
# 공유된 코드에 기반한 학습용 게시물입니다.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')
# Read the training set, the test set and the submission template from the
# current working directory, in that order.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("movies_train.csv", "movies_test.csv", "submission.csv")
)

# Notebook-style previews; as plain script statements they have no effect.
train.head(3)
test.head(3)
submission.head(3)
# Normalise distributor names in two passes:
#   1. drop the literal corporate marker "(주)";
#   2. drop every character that is not an ASCII digit/letter or a Hangul
#      syllable.
#
# regex=False is passed explicitly: when the pattern is interpreted as a
# regular expression, "(주)" is a capture group that matches every bare "주"
# and would silently eat that syllable from inside real names. Pinning the
# flag keeps the result independent of the installed pandas default.
train['distributor'] = train.distributor.str.replace("(주)", '', regex=False)
test['distributor'] = test.distributor.str.replace("(주)", '', regex=False)

# Compile once and reuse for both frames.
non_alnum = re.compile(r'[^0-9a-zA-Z가-힣]')
train['distributor'] = [non_alnum.sub('', name) for name in train.distributor]
test['distributor'] = [non_alnum.sub('', name) for name in test.distributor]
def get_dis(x):
    """Collapse a distributor name onto a canonical label.

    The name is checked against an ordered list of substring rules; the
    label of the first rule that matches is returned. A name that matches
    no rule is returned unchanged.

    Args:
        x: Distributor name as a string.

    Returns:
        The canonical label for the first matching rule, or ``x`` itself.
    """
    # Order is significant: '마운틴픽' has to be tried before the shorter
    # '마운틴', and earlier rules win when a name contains several keywords.
    rules = (
        (('CJ', 'CGV'), 'CJ'),
        (('쇼박스',), '쇼박스'),
        (('SK',), 'SK'),
        (('리틀빅픽',), '리틀빅픽처스'),
        (('스폰지',), '스폰지'),
        (('싸이더스',), '싸이더스'),
        (('에이원',), '에이원'),
        (('마인스',), '마인스'),
        (('마운틴픽',), '마운틴픽처스'),
        (('디씨드',), '디씨드'),
        (('드림팩트',), '드림팩트'),
        (('메가박스',), '메가박스'),
        (('마운틴',), '마운틴'),
    )
    for keywords, label in rules:
        if any(keyword in x for keyword in keywords):
            return label
    return x
# Replace each cleaned distributor name with its canonical label, in both
# the training and the test frame.
for frame in (train, test):
    frame['distributor'] = frame['distributor'].apply(get_dis)
# Notebook-style inspection of mean box office per genre; as a plain script
# statement the result is discarded.
train.groupby('genre')['box_off_num'].mean().sort_values()

# Hand-written genre ordering, numbered from 1.
# NOTE(review): presumably transcribed from the sorted means above -- confirm
# against the data before relying on it.
genre_order = [
    '뮤지컬', '다큐멘터리', '서스펜스', '애니메이션', '멜로/로맨스', '미스터리',
    '공포', '드라마', '코미디', 'SF', '액션', '느와르',
]
genre_to_rank = {genre: rank for rank, genre in enumerate(genre_order, start=1)}

# Genres missing from the mapping become NaN.
train['genre_rank'] = train['genre'].map(genre_to_rank)
test['genre_rank'] = test['genre'].map(genre_to_rank)
# Median box office per distributor on the training set, sorted ascending.
dist_median = train.groupby('distributor')['box_off_num'].median()
tr_nm_rank = dist_median.reset_index(name='num_rank').sort_values(by='num_rank')
tr_nm_rank  # notebook-style preview; no effect in a script

# Overwrite the median with its 1-based position in the sorted order.
tr_nm_rank['num_rank'] = list(range(1, len(tr_nm_rank) + 1))
tr_nm_rank  # notebook-style preview; no effect in a script

# Left-join the rank onto both frames (pandas joins on the shared columns).
train = pd.merge(train, tr_nm_rank, how='left')
test = pd.merge(test, tr_nm_rank, how='left')

# Zero-fills every missing value in `test`, not just `num_rank`; this covers
# distributors that have no rank because they never appear in `train`.
test.fillna(0, inplace=True)
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from ngboost import NGBRegressor
# Build the model inputs.
#   X      -- training features: one-hot encoded `screening_rat`,
#             log1p-transformed `num_actor`
#   y      -- log1p of the box-office count (the regression target)
#   target -- test-set features, prepared the same way as X
feature_cols = ['num_rank', 'time', 'num_staff', 'num_actor', 'genre_rank', 'screening_rat']

X = pd.get_dummies(columns = ['screening_rat'], data = train[feature_cols])
X['num_actor'] = np.log1p(X['num_actor'])
y = np.log1p(train.box_off_num)

target = pd.get_dummies(columns = ['screening_rat'], data = test[feature_cols])
target['num_actor'] = np.log1p(target['num_actor'])

# get_dummies only creates columns for categories present in the frame it is
# given, so train and test can end up with different dummy columns (or a
# different order). Force `target` onto X's exact column layout: categories
# absent from the test set become all-zero columns, and categories never seen
# in training are dropped because no model was fitted on them.
target = target.reindex(columns = X.columns, fill_value = 0)
# Shared splitter: 10 shuffled folds with a fixed seed.
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)

# K-fold cross-validation of GradientBoostingRegressor.
#   rmse_list -- per-fold validation RMSE, in raw box-office units
#   gb_pred   -- test-set prediction averaged over the folds
gbm = GradientBoostingRegressor(random_state = 42)
rmse_list = []
gb_pred = np.zeros((test.shape[0]))
n_folds = kf.get_n_splits()  # divisor for the fold average; follows kf
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    gbm.fit(tr_x, tr_y)
    # The model predicts log1p(count): clamp negatives to 0, then invert.
    pred = np.expm1([0 if x < 0 else x for x in gbm.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in gbm.predict(target)])
    # val_y is still on the log1p scale while pred has been inverted, so
    # invert val_y too; otherwise the error mixes two different scales.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    gb_pred += (sub_pred / n_folds)
np.mean(rmse_list)  # notebook-style preview; no effect in a script
# K-fold cross-validation of NGBRegressor using the shared `kf` splitter.
#   rmse_list -- reset here; per-fold validation RMSE in raw box-office units
#   ngb_pred  -- test-set prediction averaged over the folds
ngb = NGBRegressor(random_state = 518)
rmse_list = []
ngb_pred = np.zeros((test.shape[0]))
n_folds = kf.get_n_splits()  # divisor for the fold average; follows kf
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    ngb.fit(tr_x, tr_y)
    # The model predicts log1p(count): clamp negatives to 0, then invert.
    pred = np.expm1([0 if x < 0 else x for x in ngb.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in ngb.predict(target)])
    # val_y is still on the log1p scale while pred has been inverted, so
    # invert val_y too; otherwise the error mixes two different scales.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    ngb_pred += (sub_pred / n_folds)
np.mean(rmse_list)  # notebook-style preview; no effect in a script
# K-fold cross-validation of LGBMRegressor using the shared `kf` splitter.
#   rmse_list -- reset here; per-fold validation RMSE in raw box-office units
#   lgb_pred  -- test-set prediction averaged over the folds
lgbm = LGBMRegressor(random_state = 518)
rmse_list = []
lgb_pred = np.zeros((test.shape[0]))
n_folds = kf.get_n_splits()  # divisor for the fold average; follows kf
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    lgbm.fit(tr_x, tr_y)
    # The model predicts log1p(count): clamp negatives to 0, then invert.
    pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(target)])
    # val_y is still on the log1p scale while pred has been inverted, so
    # invert val_y too; otherwise the error mixes two different scales.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    lgb_pred += (sub_pred / n_folds)
np.mean(rmse_list)  # notebook-style preview; no effect in a script
# K-fold cross-validation of XGBRegressor using the shared `kf` splitter.
#   rmse_list -- reset here; per-fold validation RMSE in raw box-office units
#   xgb_pred  -- test-set prediction averaged over the folds
xgb = XGBRegressor(random_state = 518)
rmse_list = []
xgb_pred = np.zeros((test.shape[0]))
n_folds = kf.get_n_splits()  # divisor for the fold average; follows kf
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    xgb.fit(tr_x, tr_y)
    # The model predicts log1p(count): clamp negatives to 0, then invert.
    pred = np.expm1([0 if x < 0 else x for x in xgb.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(target)])
    # val_y is still on the log1p scale while pred has been inverted, so
    # invert val_y too; otherwise the error mixes two different scales.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    xgb_pred += (sub_pred / n_folds)
np.mean(rmse_list)  # notebook-style preview; no effect in a script
# K-fold cross-validation of CatBoostRegressor using the shared `kf` splitter.
#   rmse_list -- reset here; per-fold validation RMSE in raw box-office units
#   cat_pred  -- test-set prediction averaged over the folds
cat = CatBoostRegressor(random_state = 518, silent = True)
rmse_list = []
cat_pred = np.zeros((test.shape[0]))
n_folds = kf.get_n_splits()  # divisor for the fold average; follows kf
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    cat.fit(tr_x, tr_y)
    # The model predicts log1p(count): clamp negatives to 0, then invert.
    pred = np.expm1([0 if x < 0 else x for x in cat.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in cat.predict(target)])
    # val_y is still on the log1p scale while pred has been inverted, so
    # invert val_y too; otherwise the error mixes two different scales.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    cat_pred += (sub_pred / n_folds)
np.mean(rmse_list)  # notebook-style preview; no effect in a script
# K-fold cross-validation of RandomForestRegressor using the shared `kf`
# splitter.
#   rmse_list -- reset here; per-fold validation RMSE in raw box-office units
#   rf_pred   -- test-set prediction averaged over the folds
rf = RandomForestRegressor(random_state = 518)
rmse_list = []
rf_pred = np.zeros((test.shape[0]))
n_folds = kf.get_n_splits()  # divisor for the fold average; follows kf
for tr_idx, val_idx in kf.split(X, y):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    rf.fit(tr_x, tr_y)
    # The model predicts log1p(count): clamp negatives to 0, then invert.
    pred = np.expm1([0 if x < 0 else x for x in rf.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in rf.predict(target)])
    # val_y is still on the log1p scale while pred has been inverted, so
    # invert val_y too; otherwise the error mixes two different scales.
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), pred))
    rmse_list.append(rmse)
    rf_pred += (sub_pred / n_folds)
np.mean(rmse_list)  # notebook-style preview; no effect in a script
# Blend the six models with equal weight: add their fold-averaged test
# predictions and divide by the number of models.
blended_total = xgb_pred + cat_pred + lgb_pred + rf_pred + gb_pred + ngb_pred
submission['box_off_num'] = blended_total / 6

# Notebook-style preview; the sorted copy is discarded, so the row order of
# `submission` is unchanged.
submission.sort_values(by='box_off_num')

# Write the submission file without the index column.
submission.to_csv("1101.csv", index=False)