결과값이 '0'인 'windspeed' 데이터 채워넣기
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to every plot drawn after this point.
plt.style.use('ggplot')
# Draw minus signs as ASCII hyphens rather than the Unicode minus glyph
# (presumably because the plotting font lacks that glyph -- confirm).
mpl.rcParams['axes.unicode_minus'] = False
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any pandas or scikit-learn warnings raised by later cells.
warnings.filterwarnings('ignore')
# Read the training split; the 'datetime' column is parsed into timestamps
# on load so the .dt accessor can be used on it later.
train = pd.read_csv(
    "data_bike/train.csv",
    parse_dates=['datetime'],
)
# Preview the first two rows (displayed as the cell output).
train.head(2)
|   | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
Description
- datetime - hourly date + timestamp
- season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
- holiday - whether the day is considered a holiday
- workingday - whether the day is neither a weekend nor holiday
- weather
  - 1: Clear, Few clouds, Partly cloudy
  - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated
- registered - number of registered user rentals initiated
- count - number of total rentals
import missingno as msno
# Nullity matrix of the training frame: a quick visual check for missing
# values.  The trailing semicolon keeps the notebook from echoing the
# returned object's repr under the figure.
msno.matrix(train, figsize=(12,5));
# Read the test split the same way as the training data: 'datetime' is
# parsed into timestamps on load.
test = pd.read_csv(
    'data_bike/test.csv',
    parse_dates=['datetime'],
)
# Report (rows, columns), then preview the first two rows.
print(test.shape)
test.head(2)
(6493, 9)
|   | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-20 00:00:00 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 56 | 26.0027 |
| 1 | 2011-01-20 01:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 |
# Columns holding integer codes rather than measurements; store them with
# pandas' 'category' dtype in both the training and the test frame.
Categorical = ['season', 'holiday', 'workingday', 'weather']
for col in Categorical:
    # The loop body must be indented: both conversions run once per column.
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')
# Expand the parsed timestamp into one column per calendar/clock component.
# The tuple order fixes the order in which the new columns are appended.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'):
    train[part] = getattr(train['datetime'].dt, part)
# Report (rows, columns) after the seven columns have been added.
print(train.shape)
(10886, 19)
# Expand the parsed timestamp into one column per calendar/clock component.
# The tuple order fixes the order in which the new columns are appended.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'):
    test[part] = getattr(test['datetime'].dt, part)
# Report (rows, columns) after the seven columns have been added.
print(test.shape)
(6493, 16)
windspeed == 0 인 데이터를 회귀분석 예측값으로 대체
# Partition the training rows by whether the recorded wind speed is exactly
# zero; the zero rows are the ones whose value will be replaced later.
is_zero_wind = train.windspeed == 0
train_wind_0 = train[is_zero_wind]
print(train_wind_0.shape)
train_wind_not0 = train[~is_zero_wind]
print(train_wind_not0.shape)
(1313, 19)
(9573, 19)
windspeed가 0이 아닌 것들로 회귀분석 하기
# Predictors used to estimate wind speed.
feature_names_wnot0 = [
    'year', 'month', 'hour',
    'season', 'weather',
    'atemp', 'humidity',
]
# Training pair built only from rows with a non-zero wind reading:
# the target is the recorded wind speed, the inputs are the columns above.
y_train = train_wind_not0['windspeed']
X_train = train_wind_not0.loc[:, feature_names_wnot0]
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest that predicts wind speed from the selected features.
# The seed is fixed so the imputed wind speeds (and anything computed from
# them afterwards) are the same on every run; without it each fit draws
# different bootstrap samples and produces different predictions.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
RandomForestRegressor()
# Feature matrix for the training rows whose wind speed is recorded as zero;
# these are the rows the fitted model will produce estimates for.
X_test = train_wind_0.loc[:, feature_names_wnot0]
# Display (rows, columns) as the cell output.
X_test.shape
(1313, 7)
# Replace the zero readings with the model's estimates.  ``assign`` returns
# a new frame, so nothing is written into a boolean-mask selection of
# ``train`` (the pattern pandas reports as SettingWithCopyWarning).
train_wind_0 = train_wind_0.assign(windspeed=model.predict(X_test))
# Sanity check: number of rows still sitting at exactly zero.
len(train_wind_0[train_wind_0['windspeed']==0])
0
# Stitch the imputed rows back together with the untouched ones, then put
# the combined frame back into chronological order.
train = pd.concat([train_wind_0, train_wind_not0], axis=0)
train = train.sort_values(by='datetime')
test 셋 예측값 대입
- 모델이 예측한 값을 test 데이터 셋에 대입
# Split the test rows by whether the recorded wind speed is exactly zero.
test_wind_0 = test[test.windspeed == 0]
test_wind_not0 = test[test.windspeed != 0]
# Feature matrix for the zero-reading test rows.
X_test = test_wind_0[feature_names_wnot0]
# Replace the zero readings with the model's estimates.  ``assign`` returns
# a new frame, so nothing is written into a boolean-mask selection of
# ``test`` (the pattern pandas reports as SettingWithCopyWarning).
test_wind_0 = test_wind_0.assign(windspeed=model.predict(X_test))
# Recombine both parts and restore chronological order.
test = pd.concat([test_wind_0, test_wind_not0], axis=0).sort_values(by='datetime')