# Summarise the raw DataFrame (column dtypes and non-null counts).
# `info` is a DataFrame method; the pandas module object (`pd`) has no
# `info` attribute, so the call has to go through `df`.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197428 entries, 0 to 197427
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 market_id 196441 non-null float64
1 created_at 197428 non-null object
2 actual_delivery_time 197421 non-null object
3 store_id 197428 non-null int64
4 store_primary_category 192668 non-null object
5 order_protocol 196433 non-null float64
6 total_items 197428 non-null int64
7 subtotal 197428 non-null int64
8 num_distinct_items 197428 non-null int64
9 min_item_price 197428 non-null int64
10 max_item_price 197428 non-null int64
11 total_onshift 181166 non-null float64
12 total_busy 181166 non-null float64
13 total_outstanding_orders 181166 non-null float64
14 estimated_order_place_duration 197428 non-null int64
15 estimated_store_to_consumer_driving_duration 196902 non-null float64
dtypes: float64(6), int64(7), object(3)
memory usage: 24.1+ MB
결측치의 범위가 겹치는 부분이 많지 않으며 대부분 평균치를 내기도 어려운 부분인 관계로 모두 삭제하는 것으로 결론을 내렸다.
결측치를 채우는 방식도 진행해 보았지만 큰 변화가 없기도 하였다.
# Discard every row that has a missing value in any column, then print the
# summary again to confirm the remaining row count and dtypes.
df = df.dropna(axis=0, how='any')
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 176016 entries, 0 to 197427
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 market_id 176016 non-null object
1 created_at 176016 non-null object
2 actual_delivery_time 176016 non-null object
3 store_id 176016 non-null int64
4 store_primary_category 176016 non-null object
5 order_protocol 176016 non-null float64
6 total_items 176016 non-null int64
7 subtotal 176016 non-null int64
8 num_distinct_items 176016 non-null int64
9 min_item_price 176016 non-null int64
10 max_item_price 176016 non-null int64
11 total_onshift 176016 non-null float64
12 total_busy 176016 non-null float64
13 total_outstanding_orders 176016 non-null float64
14 estimated_order_place_duration 176016 non-null int64
15 estimated_store_to_consumer_driving_duration 176016 non-null float64
dtypes: float64(5), int64(7), object(4)
memory usage: 22.8+ MB
actual_delivery_time์์ created_at๋ฅผ 60์ผ๋ก ๋๋ ๊ฐ์ ๋นผ์ค delivery_duration ์ด์ ์์ฑํด์ค๋ค.
# Convert both timestamp columns to datetime64 values
for ts_col in ('created_at', 'actual_delivery_time'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Delivery duration: time between order creation and delivery, expressed in
# minutes (total seconds divided by 60)
elapsed = df['actual_delivery_time'] - df['created_at']
df['delivery_duration'] = elapsed.dt.total_seconds() / 60

# The raw timestamps are no longer needed once the duration exists
df = df.drop(columns=['created_at', 'actual_delivery_time'])

# Column groups: the single categorical feature is one-hot encoded; every
# other column except the target is passed through a mean imputer
categorical_cols = ['store_primary_category']
numerical_cols = df.columns.drop(categorical_cols + ['delivery_duration'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Separate the target from the features
y = df['delivery_duration']
X = df.drop(columns='delivery_duration')

# Hold out 10% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Fit the preprocessing on the training split only, then apply it to both
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
๋ชจ๋ธ์ XGBoost๋ฅผ ์ฌ์ฉํ๊ธฐ๋ก ํ๋ค.
๋จผ์ Randomforest๋ฅผ ์ฌ์ฉํด๋ณด์์ง๋ง ์ฝ 30๋ถ๊ฐ ๋ชจ๋ธ ํ์ต ์๊ฐ์ ๊ฐ์ก์ง๋ง ์ด๋ ํ ์ด์ ์ธ์ง ๋ชจ๋ฅด๊ฒ ์ง๋ง ํ์ต์ด ์๋ฃ๋์ง ์์ ๋ค๋ฅธ ์๊ณ ๋ฆฌ์ฆ์ ์ ํํ๊ฒ ๋์๋ค.
XGBoost๋ ๋จผ์ ๋ถ์คํ
์๊ณ ๋ฆฌ์ฆ์ ๊ธฐ๋ฐ์ผ๋ก ํ๋ค.
๋ถ์คํ
์ด๋ Randomforest์์ ๊ทธ ๋ค์ ์ธ๋๋ก ์งํํ๊ฒ ๋๋ ์ค์ํ ๊ฐ๋
์ด๋ค.
Randomforest๋ ๊ฐ๊ฐ์ ํธ๋ฆฌ๋ฅผ ๋
๋ฆฝ์ ์ผ๋ก ๋ง๋๋ ๋ฐ๋ฉด ๋ถ์คํ
์ ํธ๋ฆฌ๋ฅผ ์์ฐจ์ ์ผ๋ก ๋ง๋ค๋ฉด์ ์ด์ ํธ๋ฆฌ์์ ํ์ตํ ๋ด์ฉ์ด ๋ค์ ํ์ต์ ๋ฐ์๋๋ค.
์ด๋ RNN, DNN๊ณผ ์ ์ฌํ๋ค.
# Wrap the preprocessed features and their labels in DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost parameters
params = dict(
    objective='reg:squarederror',  # squared-error regression objective
    eval_metric='rmse',            # report RMSE as the evaluation metric
    learning_rate=0.1,
    max_depth=6,
    seed=42,
    # GPU training.
    # NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build, and newer
    # XGBoost releases deprecate it in favour of tree_method='hist' with
    # device='cuda' - confirm against the installed version.
    tree_method='gpu_hist',
)

# Monitor the training process: both splits are evaluated every round and
# the per-round metric values are collected into `progress`
num_boost_round = 100
evals = [(dtrain, 'train'), (dtest, 'eval')]
progress = dict()

train_options = {
    'num_boost_round': num_boost_round,
    'evals': evals,
    'evals_result': progress,
    'verbose_eval': True,
}
model = xgb.train(params, dtrain, **train_options)
import matplotlib.pyplot as plt
# Per-round RMSE histories recorded by xgb.train for each evaluation split
train_rmse = progress['train']['rmse']
eval_rmse = progress['eval']['rmse']

# Draw both learning curves on a single 10x7 inch figure
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_rmse, label='Train RMSE')
ax.plot(eval_rmse, label='Eval RMSE')
ax.set_xlabel('Number of Rounds')
ax.set_ylabel('RMSE')
ax.set_title('RMSE over Training Rounds')
ax.legend()
plt.show()

# Final model evaluation: predict the target for the held-out test matrix.
# (This header comment used to be split over two lines, which left its
# second half as a bare, non-comment line and made the cell a syntax error.)
y_pred = model.predict(dtest)

# Error metrics between the true and predicted values
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Under-prediction ratio: share of test rows whose prediction is strictly
# below the true value
under_predictions = np.sum(y_pred < y_test)
under_prediction_ratio = under_predictions / len(y_test)

print(f"pred: {y_pred}")
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Square Error: {rmse}')
print(f'Under-prediction ratio: {under_prediction_ratio}')
y_pred: [51.80057 60.28056 37.950516 ... 50.746944 47.010056 37.548664]
Mean Absolute Error: 10.915870883989538
Mean Squared Error: 295.4411207884746
Root Mean Square Error: 17.1884007629702
Under-prediction ratio: 0.4160314028899761