# 시계열 분해 간단 실습
import pandas as pd
import warnings

# Load the data; `names` supplies the labels for the two CSV columns.
data = pd.read_csv("https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/arima_data.csv", names = ['day','price'])
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   day     60 non-null     object
 1   price   60 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ KB
None


# Parse the 'day' strings into datetime values so they can act as a time index.
data['day'] = pd.to_datetime(data['day'])
# info() prints the report itself and returns None; no print() wrapper needed
# (the wrapper only echoed an extra "None" line).
data.info()

# Use the timestamps as the index so that 'price' is left as the single value
# column of the time series to analyse.
data.set_index('day', inplace=True)
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   day     60 non-null     datetime64[ns]
 1   price   60 non-null     int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 1.1 KB
None


import matplotlib.pyplot as plt
# Plot the price series; a Series is drawn against its own (datetime) index.
plt.plot(data['price'])

[<matplotlib.lines.Line2D at 0x1895e97d1f0>]


from statsmodels.tsa.seasonal import seasonal_decompose

# Enlarge the default figure size before the decomposition is drawn.
plt.rc('figure', figsize=[12, 8])

# Decompose the series with a multiplicative model and show the components.
ts = data
result = seasonal_decompose(ts, model='multiplicative')
result.plot()
plt.show()


# Exercise: stationarity testing in Python
from statsmodels.tsa.stattools import adfuller

# Split into train and test sets.
# The usual approach is to run the stationarity test on the training data only
# and to keep the test data for evaluating model performance.
training = data.loc[:"2016-12-01"]               # everything up to and including 2016-12-01
test = data.loc[~data.index.isin(training.index)]  # the remaining rows


# The data shows a clear trend, so the test is run with regression="ct".
adf = adfuller(training, regression='ct')
# The first two entries of the result are the test statistic and the p-value.
print(f"statistic: {adf[0]}")
print(f"p-value: {adf[1]}")

statistic: -1.9997199341328458
p-value: 0.6015863303793819


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# First-order difference of the training data; the first row has no
# predecessor and becomes NaN, so it is dropped.
diff_data = training.diff(1).dropna()
diff_data.plot()

<AxesSubplot:xlabel='day'>


# The first-differenced plot shows no trend, so the "c" regression setting
# (adfuller's default) is used here.
adf = adfuller(diff_data)
# The first two entries of the result are the test statistic and the p-value.
print(f"statistic: {adf[0]}")
print(f"p-value: {adf[1]}")

statistic: -12.094547576926395
p-value: 2.0851606399613667e-22


# Exercise: finding the p value of the AR model in Python
plot_pacf(diff_data) # the PACF plot is used to read off the AR(p) order
plt.show()


# Exercise: finding the q value of the MA model in Python
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

plot_acf(diff_data)  # the ACF plot is used to read off the MA(q) order
# Render the figure explicitly instead of relying on the notebook echoing the
# Figure returned by plot_acf as the cell's last expression.
plt.show()


# Apply the (p, d, q) = (2, 1, 2) orders chosen above to an ARIMA model.
from statsmodels.tsa.arima.model import ARIMA

# The datetime index carries no frequency of its own, so statsmodels had to
# infer it and emitted "No frequency information was provided, so inferred
# frequency MS will be used". Passing that month-start frequency explicitly
# keeps the same model while avoiding the repeated ValueWarning.
model = ARIMA(training, order=(2,1,2), freq='MS')
res = model.fit()
res.summary()

C:\Users\82108\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
C:\Users\82108\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
C:\Users\82108\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
C:\Users\82108\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
C:\Users\82108\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
C:\Users\82108\anaconda3\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "


# Overlay the model's in-sample predictions on the training series to see how
# closely the fit follows the data it was trained on.
in_sample_pred = res.predict()
plt.plot(in_sample_pred)
plt.plot(training)

[<matplotlib.lines.Line2D at 0x189644ddaf0>]


# Forecast as many steps ahead as there are test observations.
# forecast() returns point forecasts only and has no `alpha` parameter; the
# unknown keyword triggered a FutureWarning and is announced to become a
# TypeError in statsmodels 0.15, so it is no longer passed.
# For confidence intervals use res.get_forecast(steps).conf_int(alpha=0.05).
forecast_data = res.forecast(steps=len(test))
pred_y = forecast_data
pred_y

C:\Users\82108\anaconda3\lib\site-packages\statsmodels\tsa\statespace\representation.py:374: FutureWarning: Unknown keyword arguments: dict_keys(['alpha']).Passing unknown keyword arguments will raise a TypeError beginning in version 0.15.
  warnings.warn(msg, FutureWarning)

2017-01-01    5830.344320
2017-02-01    5508.143690
2017-03-01    5883.768662
2017-04-01    5491.991051
2017-05-01    5887.992295
2017-06-01    5491.583116
2017-07-01    5887.181951
2017-08-01    5492.780201
2017-09-01    5885.864326
2017-10-01    5494.133156
2017-11-01    5884.503309
2017-12-01    5495.493515
Freq: MS, Name: predicted_mean, dtype: float64


# Actual data for the forecast period
test_y = test
test_y


# Draw the forecast and the actual prices together for a visual comparison.
for series, colour, legend_name in (
    (pred_y, 'gold', 'Predict'),  # predicted data
    (test_y, 'green', 'test'),    # actual prices
):
    plt.plot(series, color=colour, label=legend_name)
plt.legend()
plt.show()


from sklearn.metrics import mean_squared_error, r2_score

# Score the ARIMA forecast against the held-out data: R^2 and RMSE
# (RMSE is the square root of the mean squared error).
r2 = r2_score(test_y, pred_y)
RMSE = mean_squared_error(test_y, pred_y) ** 0.5
print("r2_score", r2)
print('RMSE', RMSE)

r2_score -1.6434021791291618
RMSE 2302.418748557072


# auto_arima 설치
!pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.3-cp39-cp39-win_amd64.whl (572 kB)
Collecting statsmodels>=0.13.2
  Downloading statsmodels-0.14.0-cp39-cp39-win_amd64.whl (9.4 MB)
Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in c:\users\82108\anaconda3\lib\site-packages (from pmdarima) (0.29.24)
Requirement already satisfied: urllib3 in c:\users\82108\anaconda3\lib\site-packages (from pmdarima) (1.26.7)
Requirement already satisfied: joblib>=0.11 in c:\users\82108\anaconda3\lib\site-packages (from pmdarima) (1.2.0)
Collecting numpy>=1.21.2
  Downloading numpy-1.25.1-cp39-cp39-win_amd64.whl (15.1 MB)
Requirement already satisfied: scipy>=1.3.2 in c:\users\82108\anaconda3\lib\site-packages (from pmdarima) (1.7.1)
Requirement already satisfied: pandas>=0.19 in c:\users\82108\anaconda3\lib\site-packages (from pmdarima) (1.3.4)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in c:\users\82108\anaconda3\lib\site-packages (from pmdarima) (58.0.4)
Requirement already satisfied: scikit-learn>=0.22 in c:\users\82108\anaconda3\lib\site-packages (from pmdarima) (1.2.2)
Requirement already satisfied: pytz>=2017.3 in c:\users\82108\anaconda3\lib\site-packages (from pandas>=0.19->pmdarima) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\82108\anaconda3\lib\site-packages (from pandas>=0.19->pmdarima) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\82108\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas>=0.19->pmdarima) (1.16.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\82108\anaconda3\lib\site-packages (from scikit-learn>=0.22->pmdarima) (2.2.0)
  Downloading numpy-1.22.4-cp39-cp39-win_amd64.whl (14.7 MB)
Requirement already satisfied: patsy>=0.5.2 in c:\users\82108\anaconda3\lib\site-packages (from statsmodels>=0.13.2->pmdarima) (0.5.2)
Collecting packaging>=21.3
  Downloading packaging-23.1-py3-none-any.whl (48 kB)
Installing collected packages: numpy, packaging, statsmodels, pmdarima
  Attempting uninstall: numpy
    Found existing installation: numpy 1.20.3
    Uninstalling numpy-1.20.3:
      Successfully uninstalled numpy-1.20.3
  Attempting uninstall: packaging
    Found existing installation: packaging 21.0
    Uninstalling packaging-21.0:
      Successfully uninstalled packaging-21.0
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.12.2
    Uninstalling statsmodels-0.12.2:
      Successfully uninstalled statsmodels-0.12.2
Successfully installed numpy-1.22.4 packaging-23.1 pmdarima-2.0.3 statsmodels-0.14.0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.3.0 requires daal==2021.2.3, which is not installed.
numba 0.54.1 requires numpy<1.21,>=1.17, but you have numpy 1.22.4 which is incompatible.


# Import and run auto_arima
from pmdarima import auto_arima

# Search settings, grouped by purpose.
search_options = dict(
    # non-seasonal orders: d fixed at 1, p and q searched from 0 up to 3
    start_p=0, d=1, start_q=0, max_p=3, max_q=3,
    # seasonal orders searched from 0 up to 3, with seasonal period m=12
    start_P=0, start_Q=0, max_P=3, max_Q=3, m=12,
    seasonal=True,
    information_criterion='aic',
    # trace=True prints the fitting information of each candidate model
    trace=True,
)
auto_model = auto_arima(training, **search_options)

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=481.846, Time=0.02 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=482.652, Time=0.06 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=482.466, Time=0.07 sec
 ARIMA(0,1,0)(1,1,0)[12]             : AIC=483.637, Time=0.04 sec
 ARIMA(0,1,0)(0,1,1)[12]             : AIC=483.669, Time=0.03 sec
 ARIMA(0,1,0)(1,1,1)[12]             : AIC=inf, Time=0.15 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=481.031, Time=0.03 sec
 ARIMA(1,1,0)(0,1,1)[12]             : AIC=482.740, Time=0.07 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=inf, Time=0.23 sec
 ARIMA(2,1,0)(0,1,0)[12]             : AIC=482.616, Time=0.04 sec
 ARIMA(1,1,1)(0,1,0)[12]             : AIC=482.682, Time=0.07 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=480.687, Time=0.03 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=482.403, Time=0.07 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=inf, Time=0.19 sec
 ARIMA(0,1,2)(0,1,0)[12]             : AIC=482.683, Time=0.04 sec
 ARIMA(1,1,2)(0,1,0)[12]             : AIC=inf, Time=0.07 sec
 ARIMA(0,1,1)(0,1,0)[12] intercept   : AIC=482.687, Time=0.03 sec

Best model:  ARIMA(0,1,1)(0,1,0)[12]          
Total fit time: 1.255 seconds


auto_model.summary()


# Predict as many periods beyond the training set as the test set is long.
horizon = len(test)
raw_forecast = auto_model.predict(n_periods=horizon)
# Put the forecast into a one-column frame indexed like the test data.
auto_pred_y = pd.DataFrame(raw_forecast, index=test.index)
auto_pred_y.columns = ['predicted_price']
auto_pred_y


plt.figure(figsize=(10, 6))
# Draw the three series in a fixed order so colours and legend entries match.
for series, legend_name in (
    (training, 'Train'),          # training data
    (auto_pred_y, 'Prediction'),  # model forecast
    (test, 'Test'),               # actual prices
):
    plt.plot(series, label=legend_name)
plt.legend(loc='upper left')
plt.show()


from sklearn.metrics import mean_squared_error, r2_score

# Score the auto_arima forecast against the held-out data: R^2 and RMSE
# (RMSE is the square root of the mean squared error).
r2 = r2_score(test_y, auto_pred_y)
RMSE = mean_squared_error(test_y, auto_pred_y) ** 0.5
print("r2_score", r2)
print("RMSE", RMSE)

r2_score 0.9305467077215415
RMSE 373.206423341773

	day	price
0	2013-01-01	3794
1	2013-02-01	3863
2	2013-03-01	5190
3	2013-04-01	5783
4	2013-05-01	6298

	price
day
2013-01-01	3794
2013-02-01	3863
2013-03-01	5190
2013-04-01	5783
2013-05-01	6298

Dep. Variable:	price	No. Observations:	48
Model:	ARIMA(2, 1, 2)	Log Likelihood	-375.875
Date:	Tue, 25 Jul 2023	AIC	761.750
Time:	14:59:50	BIC	771.001
Sample:	01-01-2013	HQIC	765.231
	- 12-01-2016
Covariance Type:	opg

	coef	std err	z	P>\|z\|	[0.025	0.975]
ar.L1	-1.3167	0.190	-6.933	0.000	-1.689	-0.944
ar.L2	-0.3190	0.191	-1.673	0.094	-0.693	0.055
ma.L1	1.9700	0.243	8.109	0.000	1.494	2.446
ma.L2	0.9949	0.242	4.119	0.000	0.522	1.468
sigma2	4.457e+05	1.13e-06	3.93e+11	0.000	4.46e+05	4.46e+05

Ljung-Box (L1) (Q):	0.11	Jarque-Bera (JB):	0.38
Prob(Q):	0.74	Prob(JB):	0.83
Heteroskedasticity (H):	1.49	Skew:	-0.21
Prob(H) (two-sided):	0.44	Kurtosis:	2.89

	price
day
2017-01-01	5236
2017-02-01	5299
2017-03-01	6744
2017-04-01	7927
2017-05-01	8561
2017-06-01	8930
2017-07-01	9960
2017-08-01	8548
2017-09-01	7843
2017-10-01	7620
2017-11-01	7676
2017-12-01	5809

Ljung-Box (L1) (Q):	0.00	Jarque-Bera (JB):	1.15
Prob(Q):	0.95	Prob(JB):	0.56
Heteroskedasticity (H):	1.56	Skew:	-0.14
Prob(H) (two-sided):	0.45	Kurtosis:	2.16

	predicted_price
day
2017-01-01	5609.436974
2017-02-01	5761.436974
2017-03-01	7225.436974
2017-04-01	8298.436974
2017-05-01	8841.436974
2017-06-01	9452.436974
2017-07-01	10359.436974
2017-08-01	8777.436974
2017-09-01	8068.436974
2017-10-01	7832.436974
2017-11-01	7935.436974
2017-12-01	6279.436974

	coef	std err	z	P>\|z\|	[0.025	0.975]
ma.L1	-0.3185	0.177	-1.801	0.072	-0.665	0.028
sigma2	4.803e+04	1.64e+04	2.924	0.003	1.58e+04	8.02e+04

시계열 분석 기초(arima, sarima)

1. 시계열 분해¶

시계열 자료를 추세, 계절성, 잔차로 분해하는 기법!!!¶

시간의 요인은 추세, 계절성 이며¶

외부요인은 잔차(불규칙요인)가 있음¶

1) 모형 판단¶

추세와 계절성이 별개로 존재하면 Additive 모형을¶

추세에 따라 계절성이 있으면 Multiplicative 모형을 적용¶

day타입이 object이므로 시계열 분석을 위해 datetime 형식으로 변환해주자!!!!¶

일단 데이터를 시각화 해보면..¶

1) 오 추세에 따라 계절성이 존재하는구먼¶

2) 오 시간이 지날수록 커지는 구먼¶

결론) Multiplicative를 적용해 시계열 분해를 할 수 있다!!!!¶

시각화 확인시 추세와 계절성이 명확히 존재¶

불규칙 요인은 거의 없는것을 볼수 있음!!!!¶

이제 시간에 따른 미래 값을 예측, 현상을 분석해보자¶

2) 정상성 변환¶

정상성 : 평균, 분산이 시간에 따라 일정한 성질을 가지고 있는것, 시계열 데이터 특성이 시간의 흐름에 따라 변하지 않는 상태¶

즉, 앞서 보았던 시계열의 경우 추세나 계절성이 있는 시계열은 정상 시계열이 아님!!!¶

비정상 시계열은 ARIMA모형을 적용할수 없으니 정상 시계열로 변환해 줘야 되는데 대표적으로 로그변환, 차분이 있음!!!¶

로그변환 >> 분산(변동폭)이 일정하지 않는 경우에 사용!!¶

차분 >> 로그변환 후 추세, 계절성이 존재하는 경우 추세와 계절성을 없애기 위해 차분을 사용!!!¶

정상성 검정 실습¶

정상성을 검정하기 위해 Augmented Dickey-Fuller Test를 해야됨¶

사용 패키지는 statsmodels.tsa.stattools에 있는 adfuller함수 이다.¶

귀무가설 : 데이터가 정상성을 갖지 않음¶

대립가설 : 데이터가 정상성을 가짐¶

ct값을 적용 regression검정 결과 p-value값이 유의수준 0.05보다 높다!!!¶

그러므로 귀무가설을 기각할 수 없다. 즉, 해당 데이터는 정상성을 갖는다고 볼 근거가 없으므로 비정상 시계열로 판단한다!!!!¶

비정상 시계열을 정상시계열로 변환하기 위해 1차 차분, 로그변환 해야됨¶

검정결과 p-value값이 0.05보다 작으므로 귀무가설 기각 대립가설을 채택한다¶

즉, 정상성을 가진다고 볼수있으므로, 1차 차분으로 정상시계열로 변환했으면 이제 AR과 MA모형에 대해 시작해보자¶

3) AR모형과 MA모형¶

(1) AR모형¶

자기회귀과정, 현 시점의 데이터를 이전의 데이터들의 상관성으로 나타내는 모형이다!!!¶

과거의 값이 현재 값에 얼마나 영향을 미쳤는지 파악하는 것, 이때 최적의 성능을 가지는 모델을 만들 수 있는 과거 값의 개수(시차)를 찾게 되는데 이를 p라고 하고 AR(p) 모형이라고 함¶

PACF : 편자기상관 함수, PACF는 ACF와 다르게 시차가 다른 두 시계열 데이터 간의 순수한 상호 연관성을 나타냄, 그러므로 PACF값이 0에 수렴할 때의 p값을 AR모형의 p값으로 설정한다!!!¶

PACF 값을 확인시, 약 시차 2 이후에 0에 수렴하는 것을 알 수 있음!!!¶

즉, AR모형에 최적의 p값을 2로 설정!!!¶

PACF는 과거 t시점까지의 데이터와 현재시점의 데이터와의 상관성이다!!!¶

(2) MA모형¶

과거의 예측 오차들의 가중이동평균으로 현재 시점의 데이터를 표현하는 모형!!!¶

즉, 과거의 예측 오차를 이용해 미래를 예측하는 모형이라고 할 수 있다!!!¶

과거 예측 오차들에 따라 가중이동평균은 달라짐, 그렇기에 MA모형은 최적의 모형이 되는 구간을 구하는 것이 중요¶

MA모형이 최적이 되게끔 하는 변수가 q이며 이 모형을 MA(q)모형 이라함!!!¶

ACF : 자기상관 함수로 시차에 따른 자기상관성을 의미¶

ACF 값을 시차에 따라 그래프로 시각화해 보면 최적의 q값을 찾을 수 있음(0으로 수렴할 때의 시차를 q로 설정)¶

acf값을 확인했을때 약 시차2 이후에 0에 수렴하는 것을 알 수 있다. 즉, MA모형에서 최적의 q값은 2로 설정할 수 있다.¶

4) ARIMA¶

비정상적 시계열 자료에 대해 분석하는 모형¶

차분을 사용해 비정상 시계열을 정상 시계열로 만든다¶

정상 시계열의 경우 AR모형과 MA모형이 상호변환이 가능하기에 이 두 모형을 결합하여 과거 시점의 데이터로 현재, 미래 시점의 데이터를 예측하는 모형¶

ARIMA모형 파라미터는 p,d,q 를 사용!!!¶

d는 정상성을 가지게 될 때까지 사용되는 차분 횟수¶

ARIMA 모형은 시계열 자료외 다른 자료가 없을시, 그 변동 상태를 확인할 수 있다는 장점을 가짐, 어떤 시계열에도 적용이 가능한 모델이라는 장점이 있음!!!¶

ARIMA 파이썬 실습¶

ARIMA 모형에서 사용할 파라미터 p,d,q¶

p는 AR모형(PACF)으로, d는 차분횟수, q는 MA모형(ACF)로 최적화할 수 있음!!!!¶

모델의 AIC값을 비교하며 최적의 모델을 찾는 방법도 있으나, ARIMA에는 위와 같은 방법을 사용하고 설명하는 것이 일반적임!!!¶

(1) ARIMA함수 호출 : from statsmodels.tsa.arima.model import ARIMA¶

변환되지 않은 training 세트를 (2,1,2)로 학습시키면 회귀분석에서 보았던 것과 비슷한 형태의 결과가 나타난다.¶

주의 깊게 봐야 할 부분은¶

AIC와 AR,MA모델의 p-value이다.¶

1) AIC는 다른 모델과 비교할 때 사용할 수 있으며 AIC가 작을수록 모델의 성능이 좋다고 할 수 있다!!!¶

2) coef에서 ar,ma가 p-value 0.05이하이면 ar과ma 모형을 사용할 수 있다는 것이다.¶

3) ar,ma 뒤에 있는 L1,L2는 모델에서 사용하는 시차의 개념, 만약 p,d,q 에서 p값이 5라면(여기서는 2임) ar.L1~ar.L5 변수가 모델에서 사용된다!!!¶

모델 학습정도를 확인을 위해, 학습된 모델인 res에서 학습시킨 데이터를 예측해 보았다!!!¶

training데이터를 학습시키고 확인시, 그래프 모양이 나름 일치하므로 과소적합은 의심되지 않음!!!¶

¶

이제 학습된 모델로 아까 분리한(test) 2017년 데이터를 예측해보자¶

ARIMA에서는 예측할 때 forecast()함수를 주로 사용!!!¶

예측할 만큼의 길이를 steps에 할당한다. forecast()는 점 예측값만 반환하며 alpha 인자를 받지 않는다. 신뢰구간이 필요하면 get_forecast(steps).conf_int(alpha=0.05)처럼 유의수준 alpha(주로 0.05)를 지정한다¶

잘 예측 했는지 시각화하여 확인해보자¶

확인 해보면 별로 예측은 잘못했는디..... r2_score값 과 rmse 값을 확인해보자¶

R-squared 값은 모델이 얼마나 데이터를 잘 설명하는지를 나타내는 성능 지표이고, RMSE는 모델의 예측 오차 크기를 나타내는 성능 지표¶

확인해보면 r2값이 음수가 나오는데 모델의 정확도가 매우 낮다는 것....ㅠㅠ¶

사용한 데이터와 같이 계절성이 있는 경우는 계절성지수가 추가된 SARIMA 모델을 사용 하는 것이 좋다!!!¶

5) SARIMA¶

sarima는 데이터가 지닌 계절성 까지 고려한 arima모델!!!¶

sarima 파이썬 실습¶