065 시계열 파트 총정리

키워드: 시계열 정리, 종합, 체크리스트

개요

이 글에서는 시계열 파트(051~064번)에서 다룬 내용을 종합적으로 정리합니다. 시계열 예측 프로젝트의 전체 흐름과 핵심 개념을 복습합니다.

시계열 예측 파이프라인

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from flaml import AutoML
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 065 end-to-end pipeline example: build a synthetic daily series made of a
# linear trend, a yearly cycle, a weekly cycle and Gaussian noise.
np.random.seed(42)  # fixed seed so the noise term is reproducible
n_days = 730

day_index = np.arange(n_days)
dates = pd.date_range('2022-01-01', periods=n_days, freq='D')

trend = np.linspace(100, 200, n_days)
seasonal = 30 * np.sin(day_index * 2 * np.pi / 365)
weekly = 10 * np.sin(day_index * 2 * np.pi / 7)
noise = np.random.randn(n_days) * 15
values = trend + seasonal + weekly + noise

df = pd.DataFrame({'date': dates, 'value': values})

first_day = df['date'].min().date()
last_day = df['date'].max().date()
print("시계열 데이터 준비 완료")
print(f"기간: {first_day} ~ {last_day}")

1단계: 데이터 이해 (EDA)

def time_series_eda(df, date_col='date', target_col='value'):
    """Print summary diagnostics for a series and draw a 2x2 overview figure.

    Printed sections: descriptive statistics of the target, missing-value
    counts for the date and target columns, and the covered time range.
    Plotted panels: line plot, histogram, autocorrelation plot and a
    box plot of the target grouped by calendar month.

    Args:
        df: DataFrame holding the series; it is not modified.
        date_col: Name of the datetime column.
        target_col: Name of the numeric target column.

    Returns:
        None. Output goes to stdout and to a matplotlib figure.
    """
    from pandas.plotting import autocorrelation_plot

    timestamps = df[date_col]
    series = df[target_col]
    banner = "=" * 50

    print(banner)
    print("시계열 데이터 탐색적 분석")
    print(banner)

    # Descriptive statistics
    print("\n[기본 통계]")
    print(series.describe())

    # Missing values per column
    print("\n[결측치]")
    print(f"  날짜: {timestamps.isna().sum()}")
    print(f"  값: {series.isna().sum()}")

    # Covered time range
    start, end = timestamps.min(), timestamps.max()
    print("\n[시간 범위]")
    print(f"  시작: {start}")
    print(f"  종료: {end}")
    print(f"  기간: {(end - start).days}일")

    # One figure, four panels
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    (ax_line, ax_hist), (ax_acf, ax_box) = axes

    # Overall trajectory
    ax_line.plot(timestamps, series)
    ax_line.set_title('Time Series')

    # Value distribution
    ax_hist.hist(series, bins=50, edgecolor='black')
    ax_hist.set_title('Distribution')

    # Autocorrelation
    autocorrelation_plot(series, ax=ax_acf)
    ax_acf.set_title('Autocorrelation')

    # Box plot per calendar month, built on a copy so df stays untouched
    by_month = df.copy()
    by_month['month'] = by_month[date_col].dt.month
    by_month.boxplot(column=target_col, by='month', ax=ax_box)
    ax_box.set_title('Monthly Distribution')

    plt.tight_layout()
    plt.show()

# Run the EDA report (printed summary + 2x2 figure) on the synthetic series.
time_series_eda(df)

2단계: 전처리

def preprocess_time_series(df, date_col='date', target_col='value'):
    """Sort a time series frame, fill missing targets and smooth outliers.

    Steps, in order:
      1. Sort by ``date_col`` and reset the index.
      2. Linearly interpolate missing target values.
      3. Flag values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` and replace
         them with a centred 7-point rolling median.
      4. Report (print only) how many consecutive rows are more than one
         day apart; the gaps themselves are not filled.

    Args:
        df: Input DataFrame; it is not modified.
        date_col: Name of the datetime column.
        target_col: Name of the numeric target column.

    Returns:
        A new, date-sorted DataFrame with the cleaned target column.
    """
    # 1. Sort. The explicit copy guarantees the caller's frame is untouched.
    df = df.copy()
    df = df.sort_values(date_col).reset_index(drop=True)

    # 2. Missing values.
    # NOTE: linear interpolation draws on the next valid observation as well
    # as the previous one; switch to a forward fill if strictly no
    # look-ahead is required.
    if df[target_col].isna().any():
        df[target_col] = df[target_col].interpolate(method='linear')

    # 3. Outliers (IQR rule)
    q1, q3 = df[target_col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    outliers = (df[target_col] < lower) | (df[target_col] > upper)
    n_outliers = int(outliers.sum())
    if n_outliers > 0:
        # min_periods=1 lets the centred window shrink at the series edges.
        # With the default (min_periods == window) the first and last three
        # rows have no rolling median, so edge outliers would become NaN.
        local_median = df[target_col].rolling(7, center=True, min_periods=1).median()
        df.loc[outliers, target_col] = local_median[outliers]
        print(f"  이상치 {n_outliers}개 처리")

    # 4. Date continuity check (report only)
    date_diff = df[date_col].diff().dt.days
    gaps = date_diff[date_diff > 1]
    if len(gaps) > 0:
        print(f"  날짜 갭 {len(gaps)}개 발견")

    return df

# Clean the raw series; df_clean is the input to the following steps.
df_clean = preprocess_time_series(df)
print("전처리 완료")

3단계: 분해 및 정상성 검정

def analyze_components(df, target_col='value', period=365):
    """Decompose a series additively, plot the parts and run an ADF test.

    Args:
        df: DataFrame containing the series.
        target_col: Name of the numeric column to analyse.
        period: Requested seasonal period; it is capped at half the number
            of rows before being passed to ``seasonal_decompose``.

    Returns:
        The result object returned by ``seasonal_decompose``.
    """
    from statsmodels.tsa.seasonal import seasonal_decompose
    from statsmodels.tsa.stattools import adfuller

    series = df[target_col]

    # Decomposition, with the period capped at half the series length
    effective_period = min(period, len(df) // 2)
    decomposition = seasonal_decompose(series, model='additive', period=effective_period)

    # One stacked panel per component
    fig, axes = plt.subplots(4, 1, figsize=(14, 12))
    panels = [
        (decomposition.observed, 'Observed'),
        (decomposition.trend, 'Trend'),
        (decomposition.seasonal, 'Seasonal'),
        (decomposition.resid, 'Residual'),
    ]
    for ax, (component, title) in zip(axes, panels):
        component.plot(ax=ax, title=title)
    plt.tight_layout()
    plt.show()

    # Augmented Dickey-Fuller test; p < 0.05 is reported as stationary
    adf_stat, p_value = adfuller(series.dropna())[:2]
    verdict = 'Yes' if p_value < 0.05 else 'No (차분 필요)'
    print("\n[정상성 검정 (ADF)]")
    print(f"  통계량: {adf_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  정상성: {verdict}")

    return decomposition

# Decompose the cleaned series and print the ADF stationarity verdict.
decomp = analyze_components(df_clean)

4단계: 특성 엔지니어링

def create_ts_features(df, date_col='date', target_col='value',
                       lags=(1, 7, 14, 28, 30), windows=(7, 14, 30)):
    """Build calendar, lag, rolling and difference features for a series.

    Every target-derived feature is computed from values strictly before
    the row's own timestamp, so no column contains the value being
    predicted.

    Args:
        df: DataFrame sorted by ``date_col``; it is not modified.
        date_col: Name of the datetime column.
        target_col: Name of the numeric target column.
        lags: Lag distances (in rows) for the ``lag_<n>`` columns.
        windows: Window sizes (in rows) for the ``rolling_mean_<n>`` and
            ``rolling_std_<n>`` columns.

    Returns:
        A new DataFrame with the feature columns added and every row that
        contains a missing value dropped (this removes the warm-up rows at
        the start of the series).
    """
    df = df.copy()

    # Calendar features
    df['dayofweek'] = df[date_col].dt.dayofweek
    df['month'] = df[date_col].dt.month
    df['dayofyear'] = df[date_col].dt.dayofyear
    df['weekofyear'] = df[date_col].dt.isocalendar().week.astype(int)
    df['quarter'] = df[date_col].dt.quarter

    # Cyclical encoding so the last and first period of a cycle are adjacent
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dow_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

    # Business-calendar flags
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_end'] = df[date_col].dt.is_month_end.astype(int)

    # Lag features
    for lag in lags:
        df[f'lag_{lag}'] = df[target_col].shift(lag)

    # Target history shifted by one row: everything derived from it only
    # sees values that precede the row being predicted.
    history = df[target_col].shift(1)

    # Rolling statistics over past values
    for window in windows:
        df[f'rolling_mean_{window}'] = history.rolling(window).mean()
        df[f'rolling_std_{window}'] = history.rolling(window).std()

    # Differences of past values. Differencing the unshifted target would
    # leak it: value[t] - value[t-1] plus lag_1 reproduces value[t] exactly.
    df['diff_1'] = history.diff(1)  # value[t-1] - value[t-2]
    df['diff_7'] = history.diff(7)  # value[t-1] - value[t-8]

    # Drop the warm-up rows whose lag/rolling/diff features are undefined
    df = df.dropna()

    return df

# Build the feature table from the cleaned series.
df_features = create_ts_features(df_clean)
print(f"특성 생성 완료: {df_features.shape[1]}개 컬럼")

# Model inputs: every column except the timestamp and the target itself.
feature_cols = [col for col in df_features.columns
                if col not in ['date', 'value']]
print(f"학습 특성: {len(feature_cols)}개")

5단계: 시계열 교차 검증

from sklearn.model_selection import TimeSeriesSplit

def time_series_cv_evaluate(model, X, y, n_splits=5):
    """Score a model with ``TimeSeriesSplit`` cross-validation and report MAE.

    The same ``model`` object is refitted on each fold's training rows and
    scored on that fold's validation rows.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature DataFrame in chronological row order.
        y: Target Series aligned with ``X``.
        n_splits: Number of folds handed to ``TimeSeriesSplit``.

    Returns:
        List with one MAE value per fold, in fold order.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)

    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(splitter.split(X), 1):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        predictions = model.predict(X.iloc[val_idx])

        fold_mae = mean_absolute_error(y.iloc[val_idx], predictions)
        fold_scores.append(fold_mae)
        print(f"  Fold {fold}: MAE = {fold_mae:.2f}")

    mean_mae = np.mean(fold_scores)
    spread = np.std(fold_scores)
    print(f"\n  평균 MAE: {mean_mae:.2f} (±{spread:.2f})")
    return fold_scores

# 065 CV example: score a small random forest with time-ordered folds.
from sklearn.ensemble import RandomForestRegressor
X = df_features[feature_cols]
y = df_features['value']

print("시계열 교차 검증:")
scores = time_series_cv_evaluate(RandomForestRegressor(n_estimators=50, random_state=42), X, y)

6단계: FLAML 모델 학습

# 065 time-based split: rows dated after the cutoff (the last 60 days) form
# the test set, everything up to and including the cutoff is training data.
train_end = df_features['date'].max() - pd.Timedelta(days=60)
train = df_features[df_features['date'] <= train_end]
test = df_features[df_features['date'] > train_end]

X_train, y_train = train[feature_cols], train['value']
X_test, y_test = test[feature_cols], test['value']

print(f"학습: {len(train)}, 테스트: {len(test)}")

# 065 FLAML training: regression task, MAE metric, time-ordered splitting.
automl_settings = {
    "task": "regression",
    "time_budget": 60,
    "metric": "mae",
    "split_type": "time",
    "n_splits": 5,
    "verbose": 1,
}
automl = AutoML()
automl.fit(X_train, y_train, **automl_settings)

y_pred = automl.predict(X_test)

# Hold-out scores for the selected model
test_mae = mean_absolute_error(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

print(f"\n최적 모델: {automl.best_estimator}")
print(f"MAE: {test_mae:.2f}")
print(f"RMSE: {test_rmse:.2f}")
print(f"R²: {test_r2:.4f}")

7단계: 평가 및 시각화

def evaluate_forecast(y_true, y_pred, dates=None):
    """Compute forecast metrics and draw a 2x2 diagnostic figure.

    Both inputs are converted to flat float arrays first, so they are
    compared position by position. This avoids pandas index alignment when
    both are Series with different indexes, and lets lists and NumPy arrays
    be passed as well.

    Args:
        y_true: Actual values (Series, array or list).
        y_pred: Predicted values of the same length.
        dates: Optional x-axis values for the line plot; when omitted the
            positional index is used.

    Returns:
        Dict with keys ``'MAE'``, ``'RMSE'``, ``'R²'``, ``'MAPE'`` (percent)
        and ``'Bias'`` (mean of prediction minus actual). MAPE is averaged
        over rows whose actual value is non-zero and is NaN when there are
        none.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # Computed first so a length mismatch surfaces as the metric function's
    # own error before any element-wise arithmetic is attempted.
    mae = mean_absolute_error(actual, predicted)
    residuals = actual - predicted

    # Percentage error is undefined where the actual value is zero.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs(residuals[nonzero] / actual[nonzero])) * 100
    else:
        mape = np.nan

    metrics = {
        'MAE': mae,
        'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
        'R²': r2_score(actual, predicted),
        'MAPE': mape,
        'Bias': np.mean(predicted - actual)
    }

    print("예측 성능 지표:")
    for name, value in metrics.items():
        print(f"  {name}: {value:.2f}")

    # Diagnostic figure
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Actual vs predicted over time (or over position when no dates given)
    if dates is not None:
        axes[0, 0].plot(dates, actual, label='Actual')
        axes[0, 0].plot(dates, predicted, label='Predicted', alpha=0.8)
    else:
        axes[0, 0].plot(actual, label='Actual')
        axes[0, 0].plot(predicted, label='Predicted', alpha=0.8)
    axes[0, 0].legend()
    axes[0, 0].set_title('Actual vs Predicted')

    # Residuals against predictions
    axes[0, 1].scatter(predicted, residuals, alpha=0.5)
    axes[0, 1].axhline(y=0, color='red', linestyle='--')
    axes[0, 1].set_xlabel('Predicted')
    axes[0, 1].set_ylabel('Residual')
    axes[0, 1].set_title('Residual Plot')

    # Residual distribution
    axes[1, 0].hist(residuals, bins=30, edgecolor='black')
    axes[1, 0].set_title('Residual Distribution')

    # Actual vs predicted scatter with the y = x reference line
    axes[1, 1].scatter(actual, predicted, alpha=0.5)
    min_val = min(actual.min(), predicted.min())
    max_val = max(actual.max(), predicted.max())
    axes[1, 1].plot([min_val, max_val], [min_val, max_val], 'r--')
    axes[1, 1].set_xlabel('Actual')
    axes[1, 1].set_ylabel('Predicted')
    axes[1, 1].set_title('Actual vs Predicted Scatter')

    plt.tight_layout()
    plt.show()

    return metrics

# Score the hold-out predictions and plot them against the test dates.
metrics = evaluate_forecast(y_test, y_pred, test['date'])

핵심 개념 체크리스트

# One tuple per pipeline stage: (stage, key tasks, caveat). The rows are
# transposed into the column-oriented dict that pandas renders below.
checklist_rows = [
    ('1. 데이터 이해', 'EDA, 패턴 파악, 이상치 확인', '충분한 시간 확보'),
    ('2. 전처리', '결측치, 이상치, 날짜 연속성', '보간 시 미래값 사용 금지'),
    ('3. 분해/정상성', '추세/계절성 분리, ADF 검정', '비정상성 시 차분 적용'),
    ('4. 특성 엔지니어링', 'Lag, 이동통계, 주기인코딩', 'shift(1)로 데이터 누출 방지'),
    ('5. 교차 검증', 'TimeSeriesSplit (미래누출 방지)', 'K-Fold 사용 금지'),
    ('6. 모델링', 'FLAML split_type="time"', '시간 순서 유지'),
    ('7. 평가', 'MAE, RMSE, MAPE, 잔차분석', '여러 지표 종합 판단'),
]
checklist = {
    header: list(column)
    for header, column in zip(('단계', '핵심 작업', '주의사항'), zip(*checklist_rows))
}

print("\n시계열 예측 체크리스트:")
print(pd.DataFrame(checklist).to_string(index=False))

방법론 선택 가이드

# One tuple per scenario: (situation, recommended method, key features).
# The rows are transposed into the column-oriented dict rendered below.
method_rows = [
    ('단기 예측 (1-7일)', 'FLAML + Lag 특성', 'lag_1~7, rolling_mean_7'),
    ('중장기 예측 (1-3개월)', 'Prophet, ARIMA', '계절성 성분, 추세'),
    ('강한 계절성', 'SARIMA, Prophet', '월/주 주기인코딩'),
    ('다변량 데이터', 'FLAML + 외생변수', '다변량 Lag, 상관특성'),
    ('비선형 패턴', 'FLAML (트리 기반)', '모든 특성 조합'),
    ('빠른 프로토타입', 'FLAML AutoML', '기본 날짜/Lag 특성'),
]
methods = {
    header: list(column)
    for header, column in zip(('상황', '추천 방법', '핵심 특성'), zip(*method_rows))
}

print("\n방법론 선택 가이드:")
print(pd.DataFrame(methods).to_string(index=False))

프로젝트 응용

# One tuple per project: (project, key features, main metrics, business use).
# The rows are transposed into the column-oriented dict rendered below.
project_rows = [
    ('주가 예측', 'RSI, MACD, 볼린저밴드', '방향 정확도, 수익률', '매매 신호, 리스크 관리'),
    ('수요 예측', '제품ID, 프로모션, 계절성', 'MAPE, 안전재고', '재고 최적화, 발주량'),
    ('에너지 예측', '시간대, 기온, 요일', '피크 MAE, 비용', '피크 관리, 비용 절감'),
]
projects = {
    header: list(column)
    for header, column in zip(
        ('프로젝트', '핵심 특성', '주요 지표', '비즈니스 활용'),
        zip(*project_rows),
    )
}

print("\n프로젝트 응용:")
print(pd.DataFrame(projects).to_string(index=False))

종합 파이프라인 코드

def complete_ts_pipeline(df, date_col='date', target_col='value',
                         test_days=60, time_budget=60):
    """Run sorting, feature creation, time split, FLAML fit and scoring.

    Args:
        df: Raw DataFrame with a datetime column and a numeric target.
        date_col: Name of the datetime column.
        target_col: Name of the target column.
        test_days: Rows dated within this many days before the last
            timestamp of the feature table form the test set.
        time_budget: Value passed to ``AutoML.fit`` as ``time_budget``.

    Returns:
        Tuple ``(results, automl)``. ``results`` is a dict with keys
        ``'model'``, ``'mae'``, ``'rmse'``, ``'r2'``, ``'predictions'`` and
        ``'test_dates'``; ``automl`` is the fitted ``AutoML`` instance.
    """
    # 1. Chronological order
    ordered = df.sort_values(date_col).reset_index(drop=True)

    # 2. Feature table; inputs are all columns except date and target
    df_feat = create_ts_features(ordered, date_col, target_col)
    excluded = [date_col, target_col]
    feature_cols = [col for col in df_feat.columns if col not in excluded]

    # 3. Time-based split at the cutoff date
    train_end = df_feat[date_col].max() - pd.Timedelta(days=test_days)
    train = df_feat[df_feat[date_col] <= train_end]
    test = df_feat[df_feat[date_col] > train_end]

    X_train, y_train = train[feature_cols], train[target_col]
    X_test, y_test = test[feature_cols], test[target_col]

    # 4. FLAML search with time-ordered splitting
    fit_settings = {
        "task": "regression",
        "time_budget": time_budget,
        "metric": "mae",
        "split_type": "time",
        "verbose": 0,
    }
    automl = AutoML()
    automl.fit(X_train, y_train, **fit_settings)

    # 5. Hold-out prediction and scores
    y_pred = automl.predict(X_test)
    results = {
        'model': automl.best_estimator,
        'mae': mean_absolute_error(y_test, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
        'r2': r2_score(y_test, y_pred),
        'predictions': y_pred,
        'test_dates': test[date_col]
    }

    return results, automl

# 065 run the pipeline on the raw series with a 30-second search budget.
results, model = complete_ts_pipeline(df, time_budget=30)
print("\n종합 파이프라인 결과:")
print(f"  모델: {results['model']}")
print(f"  MAE: {results['mae']:.2f}")
print(f"  RMSE: {results['rmse']:.2f}")
print(f"  R²: {results['r2']:.4f}")

정리

시계열 파트에서 배운 내용:

기초 (051-054): 시계열 특성, 전처리, 분해, 정상성
모델링 (055-057): ARIMA, FLAML, Prophet
고급 (058-060): 특성 엔지니어링, 교차 검증, 다변량
프로젝트 (061-063): 주가, 수요, 에너지 예측
앙상블 (064): 모델 결합 기법

핵심 원칙:

데이터 누출 방지: Lag·이동통계·차분 특성은 shift()로 과거 값만 사용하고, 학습/검증 분할에는 split_type="time" 적용
적절한 특성: Lag, 이동통계, 주기 인코딩
올바른 검증: TimeSeriesSplit
종합 평가: MAE, RMSE, MAPE, 잔차 분석

다음 파트 예고

다음 파트에서는 FLAML 고급 기능을 다룹니다. 하이퍼파라미터 튜닝, 커스텀 모델, 분산 학습 등 FLAML의 고급 활용법을 알아보겠습니다.

FLAML AutoML 마스터 시리즈 #065

개요

시계열 예측 파이프라인

1단계: 데이터 이해 (EDA)

2단계: 전처리

3단계: 분해 및 정상성 검정

4단계: 특성 엔지니어링

5단계: 시계열 교차 검증

6단계: FLAML 모델 학습

7단계: 평가 및 시각화

핵심 개념 체크리스트

방법론 선택 가이드

프로젝트 응용

종합 파이프라인 코드

정리

다음 파트 예고

개요