Summary:
# Data analysis
import pandas as pd
import numpy as np
# Data analysis (visualization)
import matplotlib.pyplot as plt
import seaborn as sns
# ML modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost
# Linear regression (statsmodels)
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# RMSE
from sklearn.metrics import mean_squared_error
Train data columns:
id : sample ID
Store : store (branch) number
Date : date of the week (weekly data)
Temperature : temperature in the area around the store
Fuel_Price : fuel price in the area around the store
Promotion 1~5 : anonymized promotion information for the store
Unemployment : unemployment rate in the store's region
IsHoliday : whether the week includes a holiday
Weekly_Sales : weekly sales (the target to predict)
Test data columns: identical to the train data, except that Weekly_Sales (the target) is not included.
• Mount Google Drive, then change to the working directory.
from google.colab import drive
drive.mount('/content/drive/')
cd /content/drive/MyDrive/dacon
train_data = pd.read_csv("./dataset/train.csv")
train_data
test_data = pd.read_csv("./dataset/test.csv")
test_data
# Check for missing values
train_data.isna().sum()
# Fill missing values with 0
train_data = train_data.fillna(0)
train_data.isna().sum()
• Check which data type each column holds.
print(train_data.columns)
print(train_data.dtypes)
print("행 열 :", train_data.shape)
• Convert the Boolean variable IsHoliday to 0/1 values by casting it to an integer (a simple integer cast rather than one-hot encoding).
train_data["IsHoliday"] = train_data["IsHoliday"].astype(int)
• The year, month, and day in the Date column are stored as object (string) values, so convert them to integers.
# Convert the Date column into integer day/month/year columns (Date is a dd/mm/YYYY string)
day = []
month = []
year = []
for i in range(len(train_data["Date"])):
    day.append(int(train_data.iloc[i]["Date"][0:2]))
    month.append(int(train_data.iloc[i]["Date"][3:5]))
    year.append(int(train_data.iloc[i]["Date"][6:]))
train_data["day"] = day
train_data["month"] = month
train_data["year"] = year
• Use pandas' .hist() method, which automatically draws a histogram for every numeric column, to look at the distributions.
train_data.hist(figsize=(30,20))
• Now check the distributions numerically with describe(); look at the mean, std, and min of each column to understand how the values are spread.
train_data.describe()
# Normalize, excluding id, Date, and the target Weekly_Sales
norm = train_data.drop(['id', 'Date', 'Weekly_Sales'], axis=1)
# z-score normalization: (x - mean) / std
train_data_normed = (norm - norm.mean()) / norm.std()
train_data_normed
train_data_normed.hist(figsize=(30,20))
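• The same z-score scaling can also be done with scikit-learn's StandardScaler; a minimal sketch (not part of the original code, and note that StandardScaler divides by the population std, so the values differ very slightly from the pandas version above):
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform returns a NumPy array, so wrap it back into a DataFrame with the original columns
train_data_normed = pd.DataFrame(scaler.fit_transform(norm), columns=norm.columns, index=norm.index)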
analysis = pd.merge(train_data_normed, train_data['Weekly_Sales'],
                    left_index=True, right_index=True)
# Check linear relationships via the correlation matrix
plt.figure(figsize=(16,16))
sns.heatmap(analysis.corr(), linewidths=.5, cmap = 'Blues', annot=True)
#pairplot with Seaborn
sns.pairplot(analysis,hue='Weekly_Sales')
plt.show()
y_target = train_data['Weekly_Sales']
x_data = train_data_normed.drop(['Unemployment','IsHoliday','Promotion4', 'day', 'month'],axis=1)
train_x, test_x, train_y, test_y = train_test_split(x_data, y_target, train_size=0.9, test_size=0.1,random_state = 7)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
# Fit a first OLS model on the (somewhat arbitrarily chosen) feature set
test_x = sm.add_constant(test_x, has_constant='add')   # add the constant (intercept) term
train_x = sm.add_constant(train_x, has_constant='add') # add the constant (intercept) term
model = sm.OLS(train_y, train_x)
fitted_model = model.fit()
fitted_model.summary()
# Check multicollinearity (VIF)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(
x_data.values, i) for i in range(x_data.shape[1])]
vif["features"] = x_data.columns
vif
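• For reference, the VIF of feature i is 1 / (1 - R_i^2), where R_i^2 comes from regressing that feature on all the other features; values above roughly 10 (some use 5) are usually read as a sign of strong multicollinearity. A small sketch for flagging such columns (the threshold of 10 is a common rule of thumb, not something fixed by this analysis):
# Columns whose VIF exceeds a common rule-of-thumb threshold
high_vif = vif[vif["VIF Factor"] > 10]["features"].tolist()
print("Possibly collinear features:", high_vif)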
plt.plot(np.array(fitted_model.predict(train_x)),label="pred")
plt.plot(np.array(train_y),label="true")
plt.legend()
plt.show()
plt.plot(np.array(fitted_model.predict(test_x)),label="pred")
plt.plot(np.array(test_y),label="true")
plt.legend()
plt.show()
print("RMSE: ", mean_squared_error(test_y, fitted_model.predict(test_x))**0.5)
RMSE: 497323.21422187285
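• The RMSE above is obtained by taking the square root of mean_squared_error by hand; depending on your scikit-learn version, the same value can be computed directly, e.g.:
# RMSE directly from scikit-learn (squared=False is available from sklearn 0.22;
# very recent releases also provide a separate root_mean_squared_error function)
rmse = mean_squared_error(test_y, fitted_model.predict(test_x), squared=False)
print("RMSE:", rmse)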
decision_tree_model = DecisionTreeRegressor()
bagging_decision_tree_model = BaggingRegressor(base_estimator=decision_tree_model,  # decision tree as the base model
                                               n_estimators=5,                      # 5 bootstrap samples
                                               verbose=1, random_state=1)           # show training progress
tree_model = bagging_decision_tree_model.fit(train_x, train_y)  # train
predict = tree_model.predict(test_x)  # predict on the held-out data with the trained bagging model
print("RMSE: {}".format((mean_squared_error(predict, test_y)**0.5)))  # RMSE
RMSE: 139250.1423912945
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s finished
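• Note that the base_estimator argument was renamed to estimator in scikit-learn 1.2 and dropped in later releases; on a recent version the same model would be built roughly like this (a sketch, not the code used for the results above):
# Same bagging setup with the newer parameter name (scikit-learn >= 1.2)
bagging_decision_tree_model = BaggingRegressor(estimator=DecisionTreeRegressor(),
                                               n_estimators=5, verbose=1, random_state=1)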
for i in range(1, 17):
    for j in range(100, 700, 100):
        RFR_model = RandomForestRegressor(n_estimators=j, max_depth=i, random_state=1)
        RFR_model.fit(train_x, train_y)
        predict = RFR_model.predict(test_x)
        print("n_estimators=", j, "max_depth=", i)
        print("RMSE: {}".format((mean_squared_error(predict, test_y)**0.5)))
train_x = train_x.drop(['Promotion2','year'],axis=1)
test_x = test_x.drop(['Promotion2','year'],axis=1)
for i in range(1, 10):
    for j in range(100, 1000, 100):
        xgb_model = xgboost.XGBRegressor(n_estimators=j, learning_rate=0.08, gamma=0,
                                         subsample=0.75, colsample_bytree=1, max_depth=i)
        xgb_model.fit(train_x, train_y)
        predict = xgb_model.predict(test_x)
        print("n_estimators=", j, "max_depth=", i)
        print("RMSE: {}".format((mean_squared_error(predict, test_y)**0.5)))
OUTPUT (test RMSE for each XGBoost setting; rows = max_depth, columns = n_estimators, values rounded to the nearest integer)

max_depth \ n_estimators      100      200      300      400      500      600      700      800      900
1                          460015   438647   420737   406709   393728   381170   370253   360341   351164
2                          306459   227511   184571   166715   154806   145513   139210   135226   131301
3                          230008   156790   137698   128547   122741   120654   117549   116066   114153
4                          166817   127789   117718   114489   111667   110230   109171   108569   107365
5                          142064   114968   108744   105989   104291   103439   103046   102130   101593
6                          131861   114359   109582   107600   105622   104761   104354   104003   103863
7                          119340   107820   104259   102168   101107   100814   100811   100621   100356
8                          111916   104763   102552   101988   101733   101582   101637   101588   101582
9                          110799   105757   104305   104182   104174   104197   104243   104237   104263
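• Reading the table, the lowest test RMSE (about 100,356) comes from max_depth=7 with n_estimators=900. If you want the loop itself to keep track of the best setting, a small sketch (the best_rmse / best_params names are only illustrative):
# Track the best (max_depth, n_estimators) combination while sweeping
best_rmse, best_params = float("inf"), None
for i in range(1, 10):
    for j in range(100, 1000, 100):
        xgb_model = xgboost.XGBRegressor(n_estimators=j, learning_rate=0.08, gamma=0,
                                         subsample=0.75, colsample_bytree=1, max_depth=i)
        xgb_model.fit(train_x, train_y)
        rmse = mean_squared_error(xgb_model.predict(test_x), test_y) ** 0.5
        if rmse < best_rmse:
            best_rmse, best_params = rmse, {"max_depth": i, "n_estimators": j}
print(best_params, best_rmse)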
xgb_model = xgboost.XGBRegressor(n_estimators=500, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=10)
xgb_model.fit(train_x,train_y)
predict = xgb_model.predict(test_x)
print("RMSE: {}".format((mean_squared_error(predict, test_y)**0.5)))
RMSE: 104132.56244340152
xgboost.plot_importance(xgb_model)
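• plot_importance draws the importances as a bar chart; if you also want them as numbers, the fitted model exposes feature_importances_ (note that the two use different importance metrics by default, so the rankings can differ slightly):
# Feature importances as a sorted Series, indexed by the model's input columns
importances = pd.Series(xgb_model.feature_importances_, index=train_x.columns).sort_values(ascending=False)
print(importances)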
for j in range(100, 1000, 100):
    Adaboost_model = AdaBoostRegressor(n_estimators=j, random_state=1)
    Adaboost_model.fit(train_x, train_y)
    predict = Adaboost_model.predict(test_x)
    print("n_estimators=", j)
    print("RMSE: {}".format((mean_squared_error(predict, test_y)**0.5)))
# Preprocessing for the test data
test_data["IsHoliday"] = test_data["IsHoliday"].astype(int)
test_data = test_data.fillna(0)
# Convert the Date column into integer day/month/year columns
day = []
month = []
year = []
for i in range(len(test_data["Date"])):
    day.append(int(test_data.iloc[i]["Date"][0:2]))
    month.append(int(test_data.iloc[i]["Date"][3:5]))
    year.append(int(test_data.iloc[i]["Date"][6:]))
test_data["day"] = day
test_data["month"] = month
test_data["year"] = year
# Drop the columns that the final model does not use
normed = test_data.drop(['id', 'Date', 'year', 'Unemployment', 'IsHoliday', 'Promotion4', 'day', 'month', 'Promotion2'], axis=1)
# z-score normalization
test_data_normed = (normed - normed.mean()) / normed.std()
test_data_normed = test_data_normed.fillna(0)
test_data_normed
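• One thing to be aware of: here the test data is standardized with its own mean and std, while the model was trained on data standardized with the training-set statistics. A common alternative, sketched below under the assumption that norm still holds the unscaled training columns, is to reuse the training mean and std for the test set:
# Scale the test columns with the training-set statistics instead of the test set's own
test_data_normed = (normed - norm[normed.columns].mean()) / norm[normed.columns].std()
test_data_normed = test_data_normed.fillna(0)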
predict = xgb_model.predict(test_data_normed)
sample_submission = pd.read_csv('./dataset/sample_submission.csv')
sample_submission['Weekly_Sales'] = predict
sample_submission.to_csv('submission.csv',index = False)
sample_submission.head()