056 FLAML로 시계열 예측

키워드: FLAML, 시계열, 특성 엔지니어링

개요

이 글에서는 시계열을 회귀 문제로 변환한 뒤 FLAML AutoML로 예측합니다. 특성 엔지니어링으로 Lag 특성, 이동 통계, 날짜 특성 등을 생성하고, AutoML로 최적 모델을 찾습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, numpy, scikit-learn, matplotlib

pip install flaml[automl] pandas scikit-learn matplotlib

시계열 데이터 준비

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from flaml import AutoML
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Generate a synthetic daily time series.
np.random.seed(42)  # fixed seed so the noise term is reproducible
n_days = 730  # two years of daily observations

dates = pd.date_range('2022-01-01', periods=n_days, freq='D')

# Components: linear trend + weekly pattern + yearly seasonality + noise.
trend = np.linspace(100, 200, n_days)
weekly = 15 * np.sin(np.arange(n_days) * 2 * np.pi / 7)  # 7-day period, amplitude 15
yearly = 30 * np.sin(np.arange(n_days) * 2 * np.pi / 365)  # 365-day period, amplitude 30
noise = np.random.randn(n_days) * 10  # standard-normal noise scaled by 10

values = trend + weekly + yearly + noise

df = pd.DataFrame({
    'date': dates,
    'value': values
})

print("시계열 데이터:")
print(df.head(10))

# Plot the raw series.
plt.figure(figsize=(14, 6))
plt.plot(df['date'], df['value'])
plt.title('Time Series Data')
plt.xlabel('Date')
plt.ylabel('Value')
plt.show()

특성 엔지니어링

날짜 특성

def create_date_features(df, date_col='date'):
    """Return a copy of ``df`` extended with calendar-derived columns.

    Args:
        df: Input frame; it is copied, never modified in place.
        date_col: Name of the datetime column the features are derived from.

    Returns:
        A new DataFrame with the original columns followed by ``year``,
        ``month``, ``day``, ``dayofweek``, ``dayofyear``, ``weekofyear``,
        ``quarter``, the sine/cosine encodings ``month_sin``, ``month_cos``,
        ``day_sin``, ``day_cos`` and the 0/1 flags ``is_weekend``,
        ``is_month_start``, ``is_month_end``.
    """
    out = df.copy()
    stamps = out[date_col].dt

    # Plain calendar components read straight off the datetime accessor.
    for part in ('year', 'month', 'day', 'dayofweek', 'dayofyear'):
        out[part] = getattr(stamps, part)
    out['weekofyear'] = stamps.isocalendar().week.astype(int)
    out['quarter'] = stamps.quarter

    # Cyclical encodings: month on a 12-step circle, weekday on a 7-step circle.
    # Note that day_sin/day_cos encode the day of the WEEK, not the day of the month.
    month_angle = 2 * np.pi * out['month'] / 12
    weekday_angle = 2 * np.pi * out['dayofweek'] / 7
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['day_sin'] = np.sin(weekday_angle)
    out['day_cos'] = np.cos(weekday_angle)

    # Binary flags stored as integers (dayofweek 5 and 6 are Saturday and Sunday).
    out['is_weekend'] = (out['dayofweek'] >= 5).astype(int)
    out['is_month_start'] = stamps.is_month_start.astype(int)
    out['is_month_end'] = stamps.is_month_end.astype(int)

    return out

# Add the calendar features to the raw frame and preview a few of the new columns.
df_features = create_date_features(df)
print("\n날짜 특성:")
print(df_features[['date', 'year', 'month', 'dayofweek', 'is_weekend']].head())

Lag 특성

def create_lag_features(df, target_col='value', lags=(1, 7, 14, 30)):
    """Return a copy of ``df`` with one ``lag_<k>`` column per requested lag.

    ``lag_<k>`` holds the value of ``target_col`` from ``k`` rows earlier, so
    the first ``k`` rows of that column are NaN.

    Args:
        df: Input frame, assumed to be sorted in time order; it is not modified.
        target_col: Column whose past values become features.
        lags: Iterable of positive row offsets.

    Returns:
        A new DataFrame with the added ``lag_<k>`` columns.

    Raises:
        ValueError: If any lag is smaller than 1. A lag of 0 would copy the
            target itself and a negative lag would pull in future values,
            both of which leak the answer into the features.
    """
    lags = list(lags)  # materialize once so generators are accepted too
    invalid = [lag for lag in lags if lag < 1]
    if invalid:
        raise ValueError(f"lags must be >= 1 to avoid target leakage, got {invalid}")

    df = df.copy()
    target = df[target_col]

    for lag in lags:
        df[f'lag_{lag}'] = target.shift(lag)

    return df

# Add the default lag columns; print 35 rows so the leading NaNs of lag_30 are visible.
df_features = create_lag_features(df_features)
print("\nLag 특성:")
print(df_features[['date', 'value', 'lag_1', 'lag_7', 'lag_30']].head(35))

이동 통계 특성

def create_rolling_features(df, target_col='value', windows=(7, 14, 30)):
    """Return a copy of ``df`` with rolling mean/std/min/max of past target values.

    For every window ``w`` the columns ``rolling_mean_<w>``, ``rolling_std_<w>``,
    ``rolling_min_<w>`` and ``rolling_max_<w>`` are added. The statistics are
    computed on the target shifted by one row, so the value of the current row
    never contributes to its own features. The first ``w`` rows of each column
    are NaN.

    Args:
        df: Input frame, assumed to be sorted in time order; it is not modified.
        target_col: Column the statistics are computed from.
        windows: Iterable of window lengths in rows.

    Returns:
        A new DataFrame with the added rolling-statistic columns.
    """
    df = df.copy()

    # Shift once up front: the same "past only" series feeds every window.
    past = df[target_col].shift(1)

    for window in windows:
        roller = past.rolling(window=window)
        df[f'rolling_mean_{window}'] = roller.mean()
        df[f'rolling_std_{window}'] = roller.std()
        df[f'rolling_min_{window}'] = roller.min()
        df[f'rolling_max_{window}'] = roller.max()

    return df

# Add the default rolling statistics and preview rows 30-34.
df_features = create_rolling_features(df_features)
print("\n이동 통계 특성:")
print(df_features[['date', 'value', 'rolling_mean_7', 'rolling_std_7']].iloc[30:35])

전체 특성 파이프라인

def create_all_features(df, target_col='value', date_col='date'):
    """Run the full feature pipeline and drop rows with missing values.

    Applies, in order, the date features, the lag features (1, 2, 3, 7, 14,
    21, 30) and the rolling statistics (windows 7, 14, 30), then removes every
    row that contains a NaN. Because the longest lag and window are 30, the
    first 30 rows of the input never survive.

    Args:
        df: Input frame with a datetime column and a target column.
        target_col: Column used for lag and rolling features.
        date_col: Datetime column used for calendar features.

    Returns:
        A new DataFrame containing all engineered columns and no NaN rows.
    """
    lag_steps = [1, 2, 3, 7, 14, 21, 30]
    window_sizes = [7, 14, 30]

    with_dates = create_date_features(df.copy(), date_col)
    with_lags = create_lag_features(with_dates, target_col, lags=lag_steps)
    with_rolling = create_rolling_features(with_lags, target_col, windows=window_sizes)

    # Leading rows lack a complete lag/rolling history; discard them.
    return with_rolling.dropna()

# Build the final modelling frame from the raw series.
df_final = create_all_features(df)
print(f"\n최종 데이터: {df_final.shape}")
print(f"특성 수: {df_final.shape[1] - 2}")  # excludes the date and value columns

FLAML 학습

데이터 분할

# Time-based split: the first 80% of rows train the model, the last 20% test it.
# No shuffling, so the test period lies strictly after the training period.
train_size = int(len(df_final) * 0.8)

df_train = df_final.iloc[:train_size]
df_test = df_final.iloc[train_size:]

# Separate features from the target; 'date' is kept aside for plotting only.
feature_cols = [col for col in df_final.columns if col not in ['date', 'value']]
X_train = df_train[feature_cols]
y_train = df_train['value']
X_test = df_test[feature_cols]
y_test = df_test['value']
dates_test = df_test['date']

print(f"학습 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")
print(f"테스트 기간: {dates_test.min().strftime('%Y-%m-%d')} ~ {dates_test.max().strftime('%Y-%m-%d')}")

FLAML AutoML 학습

# Configure and run the FLAML search as a regression task optimised for MAE.
automl = AutoML()

automl.fit(
    X_train, y_train,
    task="regression",
    time_budget=120,
    metric="mae",
    split_type="time",  # time-based splitting for validation
    n_splits=5,
    seed=42,
    verbose=1
)

# Report which learner won and its validation loss.
print(f"\n최적 모델: {automl.best_estimator}")
print(f"검증 MAE: {automl.best_loss:.4f}")

예측 및 평가

# Predict on the held-out test period.
y_pred = automl.predict(X_test)

# Evaluate against the actual values.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# MAPE divides by the actual values, so it is only meaningful when y_test contains no zeros.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("\n테스트 성능:")
print(f"  MAE: {mae:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  R²: {r2:.4f}")
print(f"  MAPE: {mape:.2f}%")

# Plot actual vs. predicted values over the test period.
plt.figure(figsize=(14, 6))
plt.plot(dates_test, y_test.values, label='Actual', linewidth=2)
plt.plot(dates_test, y_pred, label='Predicted', linewidth=2, alpha=0.8)
# The shaded band is simply +/-5% of the prediction, not a statistical interval.
plt.fill_between(dates_test, y_pred * 0.95, y_pred * 1.05, alpha=0.2)
plt.title(f'FLAML Time Series Prediction (MAE: {mae:.2f})')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()

특성 중요도

# Feature importance of the best model found by FLAML.
# The best trained learner is exposed as ``automl.model`` and the underlying
# fitted estimator as ``automl.model.estimator``. Both lookups are guarded
# because not every learner provides ``feature_importances_``.
fitted_estimator = getattr(automl.model, 'estimator', None)
importance = getattr(fitted_estimator, 'feature_importances_', None)

if importance is not None:
    importance = np.asarray(importance)
    feature_names = X_train.columns

    # Indices of the 15 most important features, most important first.
    sorted_idx = np.argsort(importance)[::-1][:15]

    # barh draws from the bottom up, so reverse to put the top feature on top.
    plt.figure(figsize=(10, 8))
    plt.barh(range(len(sorted_idx)), importance[sorted_idx][::-1])
    plt.yticks(range(len(sorted_idx)), feature_names[sorted_idx][::-1])
    plt.xlabel('Importance')
    plt.title('Top 15 Feature Importance')
    plt.tight_layout()
    plt.show()

    print("\n상위 5개 중요 특성:")
    for i in sorted_idx[:5]:
        print(f"  {feature_names[i]}: {importance[i]:.4f}")

다단계 예측 (Multi-step Forecasting)

재귀적 예측

# Rolling statistics recognised in feature names of the form "rolling_<stat>_<window>".
_ROLLING_STATS = {
    'rolling_mean': np.mean,
    # ddof=1 matches the default of pandas' rolling std.
    'rolling_std': lambda window_values: np.std(window_values, ddof=1),
    'rolling_min': np.min,
    'rolling_max': np.max,
}


def _calendar_feature_values(timestamp):
    """Return every calendar feature for one timestamp, keyed by column name."""
    weekday = timestamp.dayofweek
    month = timestamp.month
    return {
        'year': timestamp.year,
        'month': month,
        'day': timestamp.day,
        'dayofweek': weekday,
        'dayofyear': timestamp.dayofyear,
        'weekofyear': int(timestamp.isocalendar()[1]),
        'quarter': timestamp.quarter,
        'month_sin': np.sin(2 * np.pi * month / 12),
        'month_cos': np.cos(2 * np.pi * month / 12),
        'day_sin': np.sin(2 * np.pi * weekday / 7),
        'day_cos': np.cos(2 * np.pi * weekday / 7),
        'is_weekend': int(weekday >= 5),
        'is_month_start': int(timestamp.is_month_start),
        'is_month_end': int(timestamp.is_month_end),
    }


def _history_feature_value(name, history):
    """Compute a ``lag_<k>`` or ``rolling_<stat>_<w>`` feature from past targets.

    Returns ``None`` when ``name`` is not such a feature or when ``history``
    is too short to compute it.
    """
    prefix, _, suffix = name.rpartition('_')
    if not suffix.isdigit():
        return None
    span = int(suffix)
    if span < 1 or span > len(history):
        return None
    if prefix == 'lag':
        return history[-span]
    stat = _ROLLING_STATS.get(prefix)
    if stat is None:
        return None
    return stat(history[-span:])


def recursive_forecast(model, df, feature_cols, target_col, steps, date_col='date'):
    """Forecast ``steps`` future days by feeding predictions back in as history.

    For each step the feature row of the NEXT date is built first, from the
    known history plus earlier predictions, and only then passed to the
    model. Features are derived from their column names:

    * calendar columns (``year``, ``month``, ``dayofweek``, ``month_sin``, ...)
      are recomputed from the new date;
    * ``lag_<k>`` is the target value ``k`` steps back;
    * ``rolling_mean|std|min|max_<w>`` is computed over the last ``w`` target
      values, which never include the value being predicted;
    * any other column, or a lag/window longer than the available history,
      keeps the value it has in the last row of ``df``.

    Args:
        model: Object with a ``predict(X)`` method accepting a DataFrame whose
            columns are ``feature_cols``.
        df: Feature frame ending at the forecast origin; must contain
            ``date_col``, ``target_col`` and every column in ``feature_cols``.
            It is not modified.
        feature_cols: Feature column names, in the order the model expects.
        target_col: Name of the target column.
        steps: Number of future days to predict.
        date_col: Name of the datetime column (consecutive daily rows assumed).

    Returns:
        A list with one prediction per step; element ``i`` belongs to the date
        ``i + 1`` days after the last date in ``df``.
    """
    if steps <= 0:
        return []

    feature_cols = list(feature_cols)
    history = df[target_col].tolist()
    # Per-column lookup keeps each column's own dtype for the carry-forward fallback.
    carry_forward = {col: df[col].iloc[-1] for col in feature_cols}
    current_date = df[date_col].iloc[-1]
    one_day = pd.Timedelta(days=1)
    predictions = []

    for _ in range(steps):
        current_date = current_date + one_day
        calendar = _calendar_feature_values(current_date)

        row = {}
        for col in feature_cols:
            value = calendar.get(col)
            if value is None:
                value = _history_feature_value(col, history)
            if value is None:
                value = carry_forward[col]
            row[col] = value

        X_next = pd.DataFrame([row], columns=feature_cols)
        pred = model.predict(X_next)[0]
        predictions.append(pred)

        # The prediction becomes "observed" history for the following steps.
        history.append(pred)

    return predictions

# Forecast 30 days beyond the end of the training frame.
future_predictions = recursive_forecast(automl, df_train, feature_cols, 'value', 30)

# Dates the forecast refers to: the 30 days following the last training date.
future_dates = pd.date_range(
    start=df_train['date'].iloc[-1] + pd.Timedelta(days=1),
    periods=30
)

# Plot the last 60 training days, the first 30 actual test values and the forecast.
plt.figure(figsize=(14, 6))
plt.plot(df_train['date'].iloc[-60:], df_train['value'].iloc[-60:], label='Historical')
plt.plot(df_test['date'][:30], y_test.values[:30], 'g-', label='Actual', linewidth=2)
plt.plot(future_dates, future_predictions, 'r--', label='Forecast', linewidth=2)
# Vertical line marks the forecast origin (last training date).
plt.axvline(x=df_train['date'].iloc[-1], color='gray', linestyle='--', alpha=0.5)
plt.title('Recursive Multi-step Forecast')
plt.legend()
plt.show()

완전한 시계열 파이프라인

class FLAMLTimeSeriesForecaster:
    """Time-series forecaster that wraps feature engineering and FLAML AutoML.

    The input to ``fit``, ``predict`` and ``score`` is a raw frame with a
    ``date`` column and a target column. Features are generated internally by
    ``create_features``, which drops rows that lack a full lag/rolling
    history, so outputs cover fewer rows than the input.
    """

    def __init__(self, time_budget=120):
        """Store the search budget; no model exists until ``fit`` is called.

        Args:
            time_budget: Value passed to ``AutoML.fit`` as ``time_budget``.
        """
        self.time_budget = time_budget
        self.automl = None  # fitted AutoML instance, set by fit()
        self.feature_cols = None  # feature column names seen during fit()

    def create_features(self, df, target_col='value'):
        """Return the engineered feature frame for ``df``."""
        return create_all_features(df, target_col)

    def _require_fitted(self):
        """Raise a clear error when predict/score is called before fit."""
        if self.automl is None or self.feature_cols is None:
            raise RuntimeError(
                "FLAMLTimeSeriesForecaster is not fitted yet; call fit() first."
            )

    def _features_and_frame(self, df, target_col):
        """Build the engineered frame and select the columns used in training."""
        df_features = self.create_features(df, target_col)
        return df_features[self.feature_cols], df_features

    def fit(self, df, target_col='value'):
        """Engineer features from ``df`` and run the FLAML search.

        Args:
            df: Raw training frame with ``date`` and ``target_col`` columns.
            target_col: Name of the column to forecast.

        Returns:
            ``self``, so calls can be chained.
        """
        df_features = self.create_features(df, target_col)

        # Everything except the date and the target is a model input.
        self.feature_cols = [col for col in df_features.columns
                             if col not in ['date', target_col]]

        X = df_features[self.feature_cols]
        y = df_features[target_col]

        self.automl = AutoML()
        self.automl.fit(
            X, y,
            task="regression",
            time_budget=self.time_budget,
            metric="mae",
            split_type="time",
            verbose=0
        )

        return self

    def predict(self, df, target_col='value'):
        """Predict the target for every row of ``df`` that has complete features.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._require_fitted()
        X, _ = self._features_and_frame(df, target_col)
        return self.automl.predict(X)

    def score(self, df, target_col='value'):
        """Return MAE, RMSE and R² on the rows of ``df`` with complete features.

        Returns:
            Dict with keys ``'mae'``, ``'rmse'`` and ``'r2'``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._require_fitted()
        X, df_features = self._features_and_frame(df, target_col)
        y = df_features[target_col]
        y_pred = self.automl.predict(X)

        return {
            'mae': mean_absolute_error(y, y_pred),
            'rmse': np.sqrt(mean_squared_error(y, y_pred)),
            'r2': r2_score(y, y_pred)
        }

# Use the pipeline on RAW (date, value) data.
# df_train / df_test already contain engineered features and have already
# lost their first 30 rows to lag/rolling warm-up. Passing them to the
# forecaster would recompute the features and drop another 30 rows from each
# split. Slicing the raw frame keeps the training and test rows unchanged.
raw_cols = ['date', 'value']
warmup_rows = 30  # longest lag / rolling window built by create_all_features
train_end = df_train.index[-1]  # label of the last training row (df has a default RangeIndex)

forecaster = FLAMLTimeSeriesForecaster(time_budget=60)
forecaster.fit(df.loc[:train_end, raw_cols])

# Prepend the last `warmup_rows` training days as history. Only they are
# dropped during feature creation, so every test day is scored.
metrics = forecaster.score(df.loc[train_end - warmup_rows + 1:, raw_cols])
print("\n파이프라인 성능:")
for name, value in metrics.items():
    print(f"  {name}: {value:.4f}")

정리

시계열을 회귀 문제로 변환하면 FLAML의 회귀 AutoML로 예측할 수 있음
특성 엔지니어링이 핵심: Lag, 이동 평균, 날짜 특성
split_type="time"으로 시간 기반 교차 검증
다단계 예측은 재귀적 방식으로 구현
Lag 특성 사용 시 데이터 누출 주의

다음 글 예고

다음 글에서는 Prophet 기초에 대해 알아보겠습니다. Facebook이 개발한 시계열 예측 라이브러리를 소개합니다.

FLAML AutoML 마스터 시리즈 #056

개요

실습 환경

시계열 데이터 준비

특성 엔지니어링

날짜 특성

Lag 특성

이동 통계 특성

전체 특성 파이프라인

FLAML 학습

데이터 분할

FLAML AutoML 학습

예측 및 평가

특성 중요도

다단계 예측 (Multi-step Forecasting)

재귀적 예측

완전한 시계열 파이프라인

정리

다음 글 예고

개요

실습 환경

시계열 데이터 준비

특성 엔지니어링

날짜 특성

Lag 특성

이동 통계 특성

전체 특성 파이프라인

FLAML 학습

데이터 분할

FLAML AutoML 학습

예측 및 평가

특성 중요도

다단계 예측 (Multi-step Forecasting)

재귀적 예측

완전한 시계열 파이프라인

정리

다음 글 예고