# 시계열 데이터 분석에 필요한 개념들을 요약하고 정리하는 시리즈 - 4. 다변량 시계열 모델 (ARCH, VAR, VECM) 편
# (Series summarizing time-series analysis concepts - Part 4: multivariate models: ARCH, VAR, VECM)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
# Load the statsmodels US macroeconomic dataset (quarterly observations).
mdata = sm.datasets.macrodata.load_pandas().data
mdata
from statsmodels.tsa.base.datetools import dates_from_str
# Build "YYYYQn" labels: float -> int -> str per column, then concatenate.
quarter_labels = (
    mdata['year'].astype(int).astype(str)
    + 'Q'
    + mdata['quarter'].astype(int).astype(str)
)
# dates_from_str parses the labels into a list of datetime objects
# (quarter-end dates).
year_quart = dates_from_str(quarter_labels)
print(year_quart[:4])
print(len(year_quart))
'''
[datetime.datetime(1959, 3, 31, 0, 0),
datetime.datetime(1959, 6, 30, 0, 0),
datetime.datetime(1959, 9, 30, 0, 0),
datetime.datetime(1959, 12, 31, 0, 0)]
203
'''
# Keep only the three series that are modeled jointly.
data = mdata[['realgdp', 'realcons', 'realinv']]
# Re-index the frame with the quarter-end datetimes built above.
data.index = pd.DatetimeIndex(year_quart)
data.head()
data.plot()
plt.show()
# Log-difference (approximate quarterly growth rate) and drop the first
# NaN row produced by differencing.
diff_data = np.log(data).diff().dropna()
diff_data.plot()
plt.show()
from statsmodels.tsa.stattools import adfuller

# Stationarity check via the Augmented Dickey-Fuller test.
# (H0): the series has a unit root (non-stationary)
# (H1): the series is stationary
# FIX: the function/loop bodies had lost their indentation (SyntaxError as
# pasted); reconstructed the obvious structure.
def adf(time_series):
    """Run the ADF test on a pandas Series and print statistic and p-value."""
    result = adfuller(time_series.values)
    print('ADF Statistic: {:.4f}'.format(result[0]))
    print('p-value: {:.4f}'.format(result[1]))

# A tiny p-value rejects H0: each differenced series is stationary.
for i in diff_data:
    print('--Test statistic for %s' % i)
    adf(diff_data[i])
    print()
'''
--Test statistic for realgdp
ADF Statistic: -6.9729
p-value: 0.0000
--Test statistic for realcons
ADF Statistic: -4.9920
p-value: 0.0000
--Test statistic for realinv
ADF Statistic: -12.2190
p-value: 0.0000
'''
# Hold out the final 10% of observations for evaluation. No shuffling:
# temporal order must be preserved for time-series forecasting.
split_at = int(len(diff_data) * 0.9)
X_train = diff_data.iloc[:split_at]
X_test = diff_data.iloc[split_at:]
print(X_train.shape, X_test.shape)
'''
(181, 3) (21, 3)
'''
# Fit a vector autoregression on the training growth rates; lag orders
# up to 3 are compared and the AIC-minimizing one is kept (the summary
# below shows a single lag, L1, was selected).
model_var = VAR(endog=X_train)
var_fit = model_var.fit(maxlags=3, ic='aic')
# Per-equation OLS coefficients plus information criteria.
var_fit.summary()
'''
Summary of Regression Results
==================================
Model: VAR
Method: OLS
Date: Thu, 30, Mar, 2023
Time: 15:31:16
--------------------------------------------------------------------
No. of Equations: 3.00000 BIC: -27.6469
Nobs: 180.000 HQIC: -27.7734
Log likelihood: 1753.15 FPE: 7.95563e-13
AIC: -27.8597 Det(Omega_mle): 7.44798e-13
--------------------------------------------------------------------
Results for equation realgdp
==============================================================================
coefficient std. error t-stat prob
------------------------------------------------------------------------------
const 0.004186 0.001007 4.158 0.000
L1.realgdp -0.333564 0.185854 -1.795 0.073
L1.realcons 0.703384 0.141602 4.967 0.000
L1.realinv 0.053481 0.027982 1.911 0.056
==============================================================================
Results for equation realcons
==============================================================================
coefficient std. error t-stat prob
------------------------------------------------------------------------------
const 0.006896 0.000852 8.095 0.000
L1.realgdp -0.073753 0.157241 -0.469 0.639
L1.realcons 0.248239 0.119802 2.072 0.038
L1.realinv 0.031390 0.023675 1.326 0.185
==============================================================================
Results for equation realinv
==============================================================================
coefficient std. error t-stat prob
------------------------------------------------------------------------------
const -0.012571 0.005163 -2.435 0.015
L1.realgdp -2.493790 0.953078 -2.617 0.009
L1.realcons 4.574692 0.726149 6.300 0.000
L1.realinv 0.300710 0.143497 2.096 0.036
==============================================================================
Correlation matrix of residuals
realgdp realcons realinv
realgdp 1.000000 0.607436 0.763364
realcons 0.607436 1.000000 0.142382
realinv 0.763364 0.142382 1.000000
'''
# lag order 값 산출 (적분차수)
var_lag_order = var_fit.k_ar
var_fit.forecast(X_train.values[-var_lag_order:], steps=len(X_test))
'''
array([[0.00788875, 0.00904605, 0.00720955],
[0.00830317, 0.00878588, 0.01130695],
[0.00820107, 0.00881935, 0.01031542],
[0.00820564, 0.00880406, 0.01042498],
[0.00819923, 0.00880337, 0.01037661],
[0.00819829, 0.00880215, 0.01037489],
[0.00819766, 0.00880187, 0.01037114],
[0.00819747, 0.00880173, 0.01037028],
[0.00819738, 0.00880168, 0.01036985],
[0.00819735, 0.00880166, 0.0103697 ],
[0.00819734, 0.00880165, 0.01036965],
[0.00819734, 0.00880165, 0.01036962],
[0.00819734, 0.00880165, 0.01036962],
[0.00819734, 0.00880165, 0.01036961],
[0.00819734, 0.00880165, 0.01036961],
[0.00819734, 0.00880165, 0.01036961],
[0.00819734, 0.00880165, 0.01036961],
[0.00819734, 0.00880165, 0.01036961],
[0.00819734, 0.00880165, 0.01036961],
[0.00819734, 0.00880165, 0.01036961],
[0.00819734, 0.00880165, 0.01036961]])
'''
var_fit.plot_forecast(steps=len(X_test))
plt.show()
# Build the forecast as a DataFrame aligned with the test index.
# FIX: forecast() must be seeded with the last k_ar observations, not just
# the last row — the earlier call (with [-var_lag_order:]) is the correct
# pattern; [-1:] only worked by accident because AIC selected lag 1 here
# and would break (or silently mis-condition) for any larger lag order.
pred_var = var_fit.forecast(X_train.values[-var_lag_order:], steps=len(X_test))
pred_df = pd.DataFrame(pred_var, index=X_test.index, columns=X_test.columns + '_pred')

# Evaluate the realgdp equation (MAE, MSE, RMSE).
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
# Mean absolute error
mae = mean_absolute_error(X_test['realgdp'], pred_df['realgdp_pred'])
print('MAE: %f' % mae)
# Mean squared error and root mean squared error
mse = mean_squared_error(X_test['realgdp'], pred_df['realgdp_pred'])
print('MSE: %f' % mse)
rmse = np.sqrt(mse)
print('RMSE: %f' % rmse)
'''
MAE: 0.005971
MSE: 0.000082
RMSE: 0.009054
'''
from statsmodels.tsa.vector_ar import vecm

## Johansen trace test for the cointegration rank (VECM 공적분 테스트).
## H0: no cointegration; H1: cointegrating relations are present.
vec_rank = vecm.select_coint_rank(
    X_train,
    det_order=1,     # deterministic trend assumption — see statsmodels docs
    k_ar_diff=1,     # number of lagged differences in the test VAR
    signif=0.01,
)
print(vec_rank.summary())
'''
Johansen cointegration test using trace test statistic with 1% significance level
=====================================
r_0 r_1 test statistic critical value
-------------------------------------
0 3 208.6 41.08
1 3 105.7 23.15
2 3 37.84 6.635
-------------------------------------
'''
# FIX: the original did `vecm = vecm.VECM(...)`, rebinding the imported
# `vecm` module name to the model instance and shadowing the module for
# any later use. Bind the model to a distinct name instead.
# NOTE(review): the trace test above rejects every rank at 1%, and a VECM
# with full coint_rank (= number of variables, 3) is equivalent to a VAR
# in levels — worth confirming this rank choice is intended.
vecm_model = vecm.VECM(endog=X_train, k_ar_diff=9, coint_rank=3, deterministic='ci')
vecm_fit = vecm_model.fit()
# 10-step-ahead forecast of the differenced series.
vecm_fit.predict(steps=10)
'''
array([[ 0.00855749, 0.00808964, 0.01745962],
[ 0.00637704, 0.00811118, 0.00024825],
[ 0.00666643, 0.00782071, 0.00703061],
[ 0.00586009, 0.00686629, 0.00285183],
[ 0.00603568, 0.00852688, -0.00404639],
[ 0.0102598 , 0.0111991 , 0.01528229],
[ 0.01035375, 0.01132725, 0.02284169],
[ 0.01152781, 0.00997501, 0.02785296],
[ 0.00895585, 0.00876293, 0.01774089],
[ 0.00853485, 0.00855157, 0.01511429]])
'''
# Plot the last 50 observations together with the 10-step forecast.
vecm_fit.plot_forecast(steps=10, n_last_obs=50);