054 정상성과 차분

키워드: 정상성, stationarity, 차분, differencing

개요

정상성(Stationarity)은 시계열 분석의 핵심 개념입니다. 많은 통계적 모델은 정상 시계열을 가정하며, 비정상 시계열은 차분(Differencing)을 통해 정상화할 수 있습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: statsmodels, pandas, numpy, matplotlib, scipy

pip install statsmodels pandas numpy matplotlib scipy

정상성의 정의

정상 시계열의 조건

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, kpss

# Reproducible synthetic data for the three example series.
np.random.seed(42)
n = 500

# White noise: the stationary example.
stationary = np.random.randn(n)

# Random walk (cumulative sum of white noise): the non-stationary example.
non_stationary = np.cumsum(np.random.randn(n))

# Linear ramp from 0 to 10 plus white noise: the trending example.
with_trend = np.linspace(0, 10, n) + np.random.randn(n)

# One row per series: line plot on the left, histogram on the right.
fig, axes = plt.subplots(3, 2, figsize=(14, 10))
panels = [
    (stationary, 'Stationary (White Noise)'),
    (non_stationary, 'Non-Stationary (Random Walk)'),
    (with_trend, 'Non-Stationary (With Trend)'),
]
for (line_ax, hist_ax), (values, title) in zip(axes, panels):
    line_ax.plot(values)
    line_ax.set_title(title)
    hist_ax.hist(values, bins=30, edgecolor='black')
    hist_ax.set_title('Distribution')

plt.tight_layout()
plt.show()

# Stationarity conditions (constant mean, constant variance, lag-only
# autocovariance), printed as a two-column table.
conditions = {
    '조건': ['상수 평균', '상수 분산', '자기공분산'],
    '설명': [
        'E[Y_t] = μ (시간에 상관없이 일정)',
        'Var(Y_t) = σ² (시간에 상관없이 일정)',
        'Cov(Y_t, Y_{t+k}) = γ_k (시간 차이에만 의존)',
    ],
}
conditions_table = pd.DataFrame(conditions)

print("정상 시계열 조건:")
print(conditions_table.to_string(index=False))

이동 평균과 분산 확인

def check_stationarity_visual(series, window=50):
    """Plot rolling mean and rolling standard deviation for a visual check.

    The top panel shows the raw series, its rolling mean and the overall
    mean; the bottom panel shows the rolling standard deviation and the
    overall standard deviation.

    Args:
        series: Sequence of observations (anything ``pd.Series`` accepts).
        window: Rolling window length passed to ``Series.rolling``.
    """
    roller = pd.Series(series).rolling(window=window)
    rolling_mean = roller.mean()
    rolling_std = roller.std()

    fig, (mean_ax, std_ax) = plt.subplots(2, 1, figsize=(14, 8))

    # Top panel: original data against its rolling and overall mean.
    mean_ax.plot(series, alpha=0.5, label='Original')
    mean_ax.plot(rolling_mean, 'r-', linewidth=2,
                 label=f'Rolling Mean ({window})')
    mean_ax.axhline(y=np.mean(series), color='green', linestyle='--',
                    label='Overall Mean')
    mean_ax.legend()
    mean_ax.set_title('Rolling Mean')

    # Bottom panel: rolling standard deviation against the overall one.
    std_ax.plot(rolling_std, 'orange', linewidth=2)
    std_ax.axhline(y=np.std(series), color='green', linestyle='--',
                   label='Overall Std')
    std_ax.legend()
    std_ax.set_title('Rolling Standard Deviation')

    plt.tight_layout()
    plt.show()

# Visual check: first the stationary series, then the random walk.
for heading, data in (
    ("정상 시계열:", stationary),
    ("\n비정상 시계열 (랜덤 워크):", non_stationary),
):
    print(heading)
    check_stationarity_visual(data)

정상성 검정

ADF 검정 (Augmented Dickey-Fuller)

def adf_test(series, name=''):
    """Run the Augmented Dickey-Fuller test and print a short report.

    Args:
        series: Observations passed straight to ``adfuller``.
        name: Label shown in the report header.

    Returns:
        True when the p-value is below 0.05 (reported as stationary),
        otherwise False.
    """
    outcome = adfuller(series, autolag='AIC')
    statistic, p_value, critical_values = outcome[0], outcome[1], outcome[4]
    rejects_null = p_value < 0.05

    report = [
        f"ADF Test - {name}",
        f"  Test Statistic: {statistic:.4f}",
        f"  p-value: {p_value:.4f}",
        "  Critical Values:",
    ]
    report.extend(
        f"    {level}: {threshold:.4f}"
        for level, threshold in critical_values.items()
    )
    # The ADF null hypothesis is non-stationarity, so rejecting it is
    # reported as "stationary".
    if rejects_null:
        report.append("  → 결론: 정상 (p < 0.05, 귀무가설 기각)")
    else:
        report.append("  → 결론: 비정상 (p >= 0.05, 귀무가설 채택)")
    print("\n".join(report))

    return rejects_null

# Run the ADF test on all three example series, separated by blank lines.
print("=" * 50)
adf_cases = [
    (stationary, 'White Noise'),
    (non_stationary, 'Random Walk'),
    (with_trend, 'With Trend'),
]
adf_flags = []
for position, (data, label) in enumerate(adf_cases):
    if position > 0:
        print()
    adf_flags.append(adf_test(data, label))
is_stationary_1, is_stationary_2, is_stationary_3 = adf_flags

KPSS 검정

def kpss_test(series, name=''):
    """Run the KPSS test (constant regression) and print a short report.

    Args:
        series: Observations passed straight to ``kpss``.
        name: Label shown in the report header.

    Returns:
        True when the p-value is at least 0.05 (reported as stationary),
        otherwise False.
    """
    outcome = kpss(series, regression='c', nlags='auto')
    statistic = outcome[0]
    p_value = outcome[1]
    critical_values = outcome[3]

    print(f"KPSS Test - {name}")
    print(f"  Test Statistic: {statistic:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print("  Critical Values:")
    for level in critical_values:
        print(f"    {level}: {critical_values[level]:.4f}")

    # Unlike ADF, the KPSS null hypothesis is stationarity, so a small
    # p-value is reported as "non-stationary".
    conclusion = (
        "  → 결론: 비정상 (p < 0.05, 귀무가설 기각)"
        if p_value < 0.05
        else "  → 결론: 정상 (p >= 0.05, 귀무가설 채택)"
    )
    print(conclusion)

    return p_value >= 0.05

# Run the KPSS test on the stationary series and on the random walk.
print("=" * 50)
kpss_cases = [(stationary, 'White Noise'), (non_stationary, 'Random Walk')]
for position, (data, label) in enumerate(kpss_cases):
    if position > 0:
        print()
    kpss_test(data, label)

종합 정상성 판단

def comprehensive_stationarity_test(series, name=''):
    """Combine the ADF and KPSS verdicts into a single label.

    Both tests are evaluated at the 0.05 level. ADF counts as "stationary"
    when its p-value is below 0.05; KPSS counts as "stationary" when its
    p-value is at least 0.05.

    Args:
        series: Observations passed to ``adfuller`` and ``kpss``.
        name: Label shown in the report header.

    Returns:
        'stationary' when both tests indicate stationarity,
        'non-stationary' when neither does, and 'trend-stationary'
        whenever the two tests disagree (in either direction).
    """
    adf_p = adfuller(series, autolag='AIC')[1]
    kpss_p = kpss(series, regression='c', nlags='auto')[1]

    adf_stationary = adf_p < 0.05
    kpss_stationary = kpss_p >= 0.05

    def verdict(flag):
        return '정상' if flag else '비정상'

    print(f"\n종합 정상성 검정 - {name}")
    print(f"  ADF p-value: {adf_p:.4f} → {verdict(adf_stationary)}")
    print(f"  KPSS p-value: {kpss_p:.4f} → {verdict(kpss_stationary)}")

    # Disagreement between the two tests is handled first; after that both
    # flags are equal, so checking one of them is enough.
    if bool(adf_stationary) != bool(kpss_stationary):
        print("  → 최종 결론: 추가 분석 필요 (트렌드 정상 가능)")
        return 'trend-stationary'
    if adf_stationary:
        print("  → 최종 결론: 정상")
        return 'stationary'
    print("  → 최종 결론: 비정상")
    return 'non-stationary'

# Apply the combined ADF + KPSS check to each example series.
for data, label in (
    (stationary, 'White Noise'),
    (non_stationary, 'Random Walk'),
    (with_trend, 'With Trend'),
):
    comprehensive_stationarity_test(data, label)

차분 (Differencing)

1차 차분

# 054 차분 함수
def difference(series, periods=1):
    """Return the ``periods``-lag difference of ``series`` without NaNs.

    Args:
        series: Observations (anything ``pd.Series`` accepts).
        periods: Lag used for the difference; 1 gives the first difference.

    Returns:
        A ``pd.Series`` of differences with NaN entries dropped.
    """
    as_series = pd.Series(series)
    lagged_delta = as_series.diff(periods)
    return lagged_delta.dropna()

# First-difference the random walk.
diff_random_walk = difference(non_stationary, 1)

# Top row: raw vs. differenced series. Bottom row: their 50-point rolling means.
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
rolling_window = 50
panels = [
    (non_stationary, 'Original (Random Walk)'),
    (diff_random_walk, 'After 1st Differencing'),
    (pd.Series(non_stationary).rolling(rolling_window).mean(),
     'Rolling Mean - Original'),
    (diff_random_walk.rolling(rolling_window).mean(),
     'Rolling Mean - After Differencing'),
]
for ax, (values, title) in zip(axes.flat, panels):
    ax.plot(values)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# ADF test on the differenced series.
print("차분 후 정상성 검정:")
adf_test(diff_random_walk.values, 'Differenced Random Walk')

계절 차분

# Two years of daily data: linear trend + yearly sine cycle + Gaussian noise.
np.random.seed(42)
n = 730
t = np.arange(n)
trend_part = 50 + 0.05 * t
cycle_part = 10 * np.sin(2 * np.pi * t / 365)
noise_part = np.random.randn(n) * 3
seasonal = trend_part + cycle_part + noise_part

# First difference, seasonal (lag-365) difference, and both combined.
diff1 = difference(seasonal, 1)
diff_seasonal = difference(seasonal, 365)
diff_both = difference(diff1, 365)

# 2x2 grid comparing the original with each differenced variant.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
panels = [
    (seasonal, 'Original (Trend + Seasonality)'),
    (diff1, 'After 1st Differencing'),
    (diff_seasonal, 'After Seasonal Differencing (365)'),
    (diff_both, 'After 1st + Seasonal Differencing'),
]
for ax, (values, title) in zip(axes.flat, panels):
    ax.plot(values)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# ADF p-value for each variant.
print("\n차분별 정상성:")
variants = [
    ('Original', seasonal),
    ('1st Diff', diff1),
    ('Seasonal Diff', diff_seasonal),
    ('Both', diff_both),
]
for name, data in variants:
    # pandas objects get NaNs dropped; plain arrays are passed through.
    cleaned = data.dropna() if hasattr(data, 'dropna') else data
    result = adfuller(cleaned)
    print(f"  {name}: ADF p-value = {result[1]:.4f}")

최적 차분 차수 찾기

from statsmodels.tsa.stattools import adfuller

def find_optimal_d(series, max_d=3, significance=0.05):
    """Find the smallest differencing order whose ADF test rejects the null.

    The series is tested as-is (d=0) and then after each additional first
    difference, up to ``max_d``. One progress line is printed per order.

    Args:
        series: Observations (anything ``pd.Series`` accepts).
        max_d: Highest differencing order to try.
        significance: p-value threshold below which the series is accepted.

    Returns:
        The first order ``d`` whose ADF p-value is below ``significance``;
        ``max_d`` (after printing a warning) when no order qualifies.
    """
    working = pd.Series(series).copy()
    order = 0

    while order <= max_d:
        # Order 0 tests the untouched series; later orders difference first.
        if order > 0:
            working = working.diff().dropna()

        p_value = adfuller(working, autolag='AIC')[1]
        accepted = p_value < significance
        suffix = " → 정상화됨" if accepted else " → 비정상"
        print(f"d={order}: ADF p-value = {p_value:.4f}" + suffix)

        if accepted:
            return order
        order += 1

    print(f"경고: {max_d}차 차분까지도 정상화되지 않음")
    return max_d

# Search for the differencing order of the random walk and the seasonal series.
print("최적 차분 차수 찾기:")
for heading, data in (
    ("\nRandom Walk:", non_stationary),
    ("\nSeasonal + Trend:", seasonal),
):
    print(heading)
    optimal_d = find_optimal_d(data)
    print(f"→ 최적 d = {optimal_d}")

로그 변환과 차분

# Series with multiplicative seasonality: exponential growth scaled by a
# yearly sine factor and by multiplicative noise. Reuses `t` and `n` from
# the seasonal example.
np.random.seed(42)
growth = 100 * np.exp(0.001 * t)
cycle_factor = 1 + 0.3 * np.sin(2 * np.pi * t / 365)
multiplicative = growth * cycle_factor
multiplicative = multiplicative * (1 + np.random.randn(n) * 0.05)

# Difference the raw series directly, and difference its logarithm.
direct_diff = difference(multiplicative, 1)
log_series = np.log(multiplicative)
log_diff = difference(log_series, 1)

# 2x2 grid: raw, log-transformed, directly differenced, log-differenced.
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
panels = [
    (multiplicative, 'Original (Multiplicative)'),
    (log_series, 'After Log Transform'),
    (direct_diff, 'Direct Differencing'),
    (log_diff, 'Log + Differencing'),
]
for ax, (values, title) in zip(axes.flat, panels):
    ax.plot(values)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Compare ADF p-values with and without the log transform.
print("\n로그 변환 효과:")
result1 = adfuller(direct_diff.dropna())
result2 = adfuller(log_diff.dropna())
print(f"  직접 차분 ADF p-value: {result1[1]:.4f}")
print(f"  로그 + 차분 ADF p-value: {result2[1]:.4f}")

차분과 FLAML

class StationaryTransformer:
    """Make a series stationary via optional log transform and differencing.

    ``fit_transform`` applies first differences until the ADF test rejects
    the null at the 0.05 level or ``max_d`` differences have been applied.
    ``inverse_transform`` undoes those steps using the first value stored
    before each difference.
    """

    def __init__(self, max_d=2, use_log=False):
        """Store the configuration.

        Args:
            max_d: Maximum number of first differences to apply.
            use_log: When True, take ``np.log`` before differencing and
                ``np.exp`` after integrating back.
        """
        self.max_d = max_d
        self.use_log = use_log
        # Number of differences actually applied by the last fit_transform.
        self.d = 0
        # First value of the series before each differencing step, in order.
        self.initial_values = []
        # Index of the series passed to the last fit_transform (None before).
        self._fit_index = None

    def fit_transform(self, series):
        """Difference ``series`` until it passes the ADF test.

        Args:
            series: Observations (anything ``pd.Series`` accepts).

        Returns:
            The transformed ``pd.Series``; it is ``self.d`` elements
            shorter than the input.
        """
        series = pd.Series(series).copy()
        self._fit_index = series.index

        if self.use_log:
            series = np.log(series)

        # Reset state so repeated calls do not accumulate earlier results.
        self.d = 0
        self.initial_values = []

        # At most max_d differences; self.d always counts those applied.
        for _ in range(self.max_d):
            if adfuller(series.dropna())[1] < 0.05:
                break
            # Remember the leading value that diff() is about to discard.
            self.initial_values.append(series.iloc[0])
            series = series.diff().dropna()
            self.d += 1

        return series

    def inverse_transform(self, series):
        """Integrate a differenced series back to the original scale.

        Each stored initial value is prepended before the cumulative sum,
        so the result is ``len(self.initial_values)`` elements longer than
        the input and starts at the original first observation.

        Args:
            series: Differenced values, e.g. the output of ``fit_transform``.

        Returns:
            A ``pd.Series`` on the original scale. It carries the index
            seen by ``fit_transform`` when the lengths match, otherwise a
            default integer index.
        """
        values = pd.Series(series).to_numpy(dtype=float)

        # Undo the differences from the innermost (last applied) outwards.
        for initial in reversed(self.initial_values):
            values = np.cumsum(np.concatenate(([initial], values)))

        if self.use_log:
            values = np.exp(values)

        index = self._fit_index
        if index is not None and len(index) == len(values):
            return pd.Series(values, index=index)
        return pd.Series(values)

# Usage example: make the random walk stationary and report what was done.
transformer = StationaryTransformer(max_d=2, use_log=False)
stationary_series = transformer.fit_transform(non_stationary)

summary_lines = (
    f"필요한 차분 횟수: {transformer.d}",
    f"변환 후 길이: {len(stationary_series)}",
)
print(*summary_lines, sep="\n")

정리

정상성: 평균과 분산이 시간에 상관없이 일정하고, 자기공분산은 시점이 아닌 시차(lag)에만 의존
ADF 검정: 귀무가설 = 단위근 존재(비정상) (p < 0.05면 귀무가설을 기각하고 정상으로 판단)
KPSS 검정: 귀무가설 = 정상 (p >= 0.05면 귀무가설을 기각하지 못하므로 정상으로 판단)
차분: 비정상 시계열을 정상화하는 방법
계절 차분: 계절성 제거를 위한 주기적 차분
FLAML 사용 시 Lag 특성과 차분 특성 활용

다음 글 예고

다음 글에서는 ARIMA 기초에 대해 알아보겠습니다. 시계열 예측의 고전적 방법인 ARIMA 모델을 소개합니다.

FLAML AutoML 마스터 시리즈 #054

개요​

실습 환경​

정상성의 정의​

정상 시계열의 조건​

이동 평균과 분산 확인​

정상성 검정​

ADF 검정 (Augmented Dickey-Fuller)​

KPSS 검정​

종합 정상성 판단​

차분 (Differencing)​

1차 차분​

계절 차분​

최적 차분 차수 찾기​

로그 변환과 차분​

차분과 FLAML​

정리​

다음 글 예고​

개요

실습 환경

정상성의 정의

정상 시계열의 조건

이동 평균과 분산 확인

정상성 검정

ADF 검정 (Augmented Dickey-Fuller)

KPSS 검정

종합 정상성 판단

차분 (Differencing)

1차 차분

계절 차분

최적 차분 차수 찾기

로그 변환과 차분

차분과 FLAML

정리

다음 글 예고