078 시계열 특성 엔지니어링

키워드: 특성 엔지니어링, feature engineering

개요

시계열 특성 엔지니어링은 날짜/시간 정보와 과거 값을 활용하여 예측에 유용한 특성을 생성하는 과정입니다. ML 모델에서 특히 중요하며, 예측 성능을 크게 향상시킬 수 있습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

데이터 준비

import pandas as pd
import numpy as np

# Build a synthetic daily sales series: 3 x 365 days starting 2020-01-01.
np.random.seed(42)  # fixed seed so the noise component is reproducible
n_days = 365 * 3
dates = pd.date_range('2020-01-01', periods=n_days, freq='D')
day_number = np.arange(n_days)

# Components: linear trend, yearly and weekly sine waves, Gaussian noise.
trend = np.linspace(100, 200, n_days)
yearly = 30 * np.sin(2 * np.pi * day_number / 365)
weekly = 10 * np.sin(2 * np.pi * day_number / 7)
noise = np.random.normal(0, 10, n_days)

values = trend + yearly + weekly + noise

# Index the frame by date; later sections read calendar fields from the index.
data = pd.DataFrame({'date': dates, 'sales': values}).set_index('date')

print(f"데이터 기간: {data.index.min()} ~ {data.index.max()}")
print(data.head())

날짜/시간 특성

기본 시간 특성

import pandas as pd

def create_datetime_features(df, date_col=None):
    """Add calendar-derived feature columns to a copy of ``df``.

    Args:
        df: Input frame. It is never modified; a copy is returned.
        date_col: Name of the column holding the timestamps. The column is
            parsed with ``pd.to_datetime`` and the parsed values are stored
            in a ``date`` column of the result. When omitted, timestamps are
            read from the index and no ``date`` column is added, changed or
            removed.

    Returns:
        The copied frame with ``year``, ``month``, ``day``, ``dayofweek``
        (0 = Monday), ``dayofyear``, ``weekofyear`` (ISO week number),
        ``quarter`` and the 0/1 flags ``is_weekend``, ``is_month_start``,
        ``is_month_end``, ``is_quarter_start`` and ``is_quarter_end``.
    """
    df = df.copy()

    if date_col:
        df['date'] = pd.to_datetime(df[date_col])
        stamps = df['date']
    else:
        # Read the index through a standalone Series instead of a scratch
        # 'date' column: writing and later dropping a scratch column would
        # destroy a 'date' column the caller already has.
        stamps = pd.Series(df.index, index=df.index)

    when = stamps.dt

    # Basic calendar fields
    df['year'] = when.year
    df['month'] = when.month
    df['day'] = when.day
    df['dayofweek'] = when.dayofweek  # 0 = Monday
    df['dayofyear'] = when.dayofyear
    df['weekofyear'] = when.isocalendar().week.astype(int)
    df['quarter'] = when.quarter

    # Hour-of-day would go here for intraday data:
    # df['hour'] = when.hour

    # Binary flags
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_start'] = when.is_month_start.astype(int)
    df['is_month_end'] = when.is_month_end.astype(int)
    df['is_quarter_start'] = when.is_quarter_start.astype(int)
    df['is_quarter_end'] = when.is_quarter_end.astype(int)

    return df

# Apply the calendar feature builder to the synthetic sales frame and show
# the first rows plus the resulting column names.
data_features = create_datetime_features(data)
print(data_features.head())
print(f"\n생성된 특성: {data_features.columns.tolist()}")

순환 인코딩

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def create_cyclical_features(df, col, max_val):
    """Encode a periodic column as a sine/cosine pair.

    Args:
        df: Input frame. It is left untouched; the new columns are added to
            a copy, matching the other feature builders in this file.
        col: Name of the numeric column to encode.
        max_val: Length of one full cycle, in the units of ``col``
            (e.g. 12 for month, 7 for day of week).

    Returns:
        A copy of ``df`` with ``{col}_sin`` and ``{col}_cos`` appended.
    """
    df = df.copy()
    # Map the value onto an angle so the last and first values of a cycle
    # end up next to each other on the unit circle.
    angle = 2 * np.pi * df[col] / max_val
    df[f'{col}_sin'] = np.sin(angle)
    df[f'{col}_cos'] = np.cos(angle)
    return df

# Sin/cos-encode each periodic calendar column on a copy of the feature frame.
# NOTE(review): the 2020 rows include day-of-year 366, which a period of 365
# maps onto the same angle as day 1.
data_cyclic = data_features.copy()
for cyc_col, cyc_period in (('month', 12), ('dayofweek', 7), ('dayofyear', 365)):
    data_cyclic = create_cyclical_features(data_cyclic, cyc_col, cyc_period)

# Visual comparison: raw month number vs. its position on the unit circle.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
linear_ax, circle_ax = axes

# Left panel: month as a plain integer on the x axis.
linear_ax.scatter(data_cyclic['month'], data_cyclic['sales'], alpha=0.3)
linear_ax.set_xlabel('Month (Linear)')
linear_ax.set_ylabel('Sales')
linear_ax.set_title('Linear Encoding')

# Right panel: (sin, cos) pairs, coloured by month.
scatter = circle_ax.scatter(
    data_cyclic['month_sin'],
    data_cyclic['month_cos'],
    c=data_cyclic['month'],
    cmap='hsv',
    alpha=0.3,
)
plt.colorbar(scatter, ax=circle_ax, label='Month')
circle_ax.set_xlabel('Month Sin')
circle_ax.set_ylabel('Month Cos')
circle_ax.set_title('Cyclical Encoding')

plt.tight_layout()
plt.savefig('cyclical_encoding.png', dpi=150)

순환 인코딩 필요성

문제: 선형 인코딩
- 12월(12) → 1월(1): 거리 = 11
- 실제 거리: 1 (이웃)

해결: 순환 인코딩
- sin/cos로 원형 표현
- 12월 → 1월: 연속적
- 계절성 포착에 효과적

Lag 특성 (지연 특성)

import pandas as pd

def create_lag_features(df, target_col, lags):
    """Return a copy of ``df`` with one lagged copy of ``target_col`` per lag.

    Args:
        df: Input frame (not modified).
        target_col: Column whose past values are exposed as features.
        lags: Iterable of row offsets; each ``k`` yields a column named
            ``{target_col}_lag_{k}`` holding the value ``k`` rows earlier.

    Returns:
        A new frame with the lag columns appended in the order given.
    """
    source = df[target_col]
    lagged = {f'{target_col}_lag_{k}': source.shift(k) for k in lags}
    # assign() works on a copy, so the caller's frame stays as it was.
    return df.assign(**lagged)

# Apply: add lagged copies of 'sales' on top of the calendar features and
# preview a few of them.
lags = [1, 7, 14, 28, 30, 365]  # 1 day, 1 week, 2 weeks, 4 weeks, 1 month, 1 year back
data_lag = create_lag_features(data_features, 'sales', lags)

print(data_lag[['sales', 'sales_lag_1', 'sales_lag_7', 'sales_lag_365']].head(10))

Lag 선택 가이드

데이터 주기에 따른 Lag:

일별 데이터:
- lag_1: 어제
- lag_7: 지난주 같은 요일
- lag_14: 2주 전
- lag_30/31: 지난달
- lag_365: 작년

주별 데이터:
- lag_1: 지난주
- lag_4: 4주 전
- lag_52: 작년

월별 데이터:
- lag_1: 지난달
- lag_12: 작년 같은 달

이동 평균/통계 특성

import pandas as pd
import numpy as np

def create_rolling_features(df, target_col, windows):
    """Add trailing-window statistics of ``target_col`` to a copy of ``df``.

    For every window size ``w`` the columns ``{target_col}_rolling_mean_{w}``,
    ``..._std_{w}``, ``..._min_{w}`` and ``..._max_{w}`` are appended.

    Args:
        df: Input frame (not modified).
        target_col: Column to summarise.
        windows: Iterable of window lengths, in rows.

    Returns:
        A new frame with four statistic columns per window size.
    """
    df = df.copy()

    # Shift once, outside the loop: the statistic for a row must be built
    # only from earlier rows, and the shifted series is the same for every
    # window size.
    past = df[target_col].shift(1)

    for window in windows:
        # min_periods=1 yields a value as soon as one past observation exists.
        roller = past.rolling(window=window, min_periods=1)
        prefix = f'{target_col}_rolling'

        df[f'{prefix}_mean_{window}'] = roller.mean()
        df[f'{prefix}_std_{window}'] = roller.std()
        df[f'{prefix}_min_{window}'] = roller.min()
        df[f'{prefix}_max_{window}'] = roller.max()

    return df

# Apply: add rolling statistics for several window lengths on top of the lag
# features and preview two of the rolling means.
windows = [7, 14, 30, 90]
data_rolling = create_rolling_features(data_lag, 'sales', windows)

print(data_rolling[['sales', 'sales_rolling_mean_7', 'sales_rolling_mean_30']].head(10))

지수 가중 이동 평균 (EWMA)

import pandas as pd

def create_ewm_features(df, target_col, spans):
    """Add exponentially weighted moving averages of ``target_col``.

    Args:
        df: Input frame (not modified).
        target_col: Column to smooth.
        spans: Iterable of ``span`` values passed to ``Series.ewm``; each one
            yields a column named ``{target_col}_ewm_{span}``.

    Returns:
        A new frame with one EWM column per span.
    """
    df = df.copy()

    # Shift once, outside the loop: each average may only see earlier rows,
    # and the shifted series does not depend on the span.
    past = df[target_col].shift(1)

    for span in spans:
        # Recent observations get larger weights than older ones.
        df[f'{target_col}_ewm_{span}'] = past.ewm(span=span, min_periods=1).mean()

    return df

# Apply: add EWM columns on top of the rolling features and show the last rows.
spans = [7, 14, 30]
data_ewm = create_ewm_features(data_rolling, 'sales', spans)

print(data_ewm[['sales', 'sales_ewm_7', 'sales_ewm_30']].tail())

차분 특성

import pandas as pd

def create_diff_features(df, target_col, periods):
    """Return a copy of ``df`` with difference and percent-change columns.

    Args:
        df: Input frame (not modified).
        target_col: Column to difference.
        periods: Iterable of row offsets. Each ``p`` yields
            ``{target_col}_diff_{p}`` (absolute change over ``p`` rows) and
            ``{target_col}_pct_change_{p}`` (relative change over ``p`` rows).

    Returns:
        A new frame with two columns appended per period, diff first.
    """
    series = df[target_col]
    derived = {}
    for step in periods:
        derived[f'{target_col}_diff_{step}'] = series.diff(step)
        derived[f'{target_col}_pct_change_{step}'] = series.pct_change(step)
    # assign() works on a copy, so the caller's frame stays as it was.
    return df.assign(**derived)

# Apply: add difference / percent-change columns on top of the EWM features
# and preview a few of them.
periods = [1, 7, 30]
data_diff = create_diff_features(data_ewm, 'sales', periods)

print(data_diff[['sales', 'sales_diff_1', 'sales_diff_7', 'sales_pct_change_1']].head(10))

휴일 특성

import pandas as pd
import numpy as np

def create_holiday_features(df, country='US'):
    """Flag rows that fall on a fixed-date public holiday.

    Args:
        df: Input frame (not modified). Dates are taken from the index when
            it is a ``DatetimeIndex``; otherwise from a ``date`` column.
        country: Kept for interface compatibility but not consulted; the
            built-in table below lists fixed-date South Korean holidays.

    Returns:
        A copy of ``df`` with ``is_holiday`` (0/1) plus the placeholder
        columns ``days_to_holiday`` and ``days_from_holiday``, both set to 0.

    Raises:
        KeyError: If the index is not a ``DatetimeIndex`` and there is no
            ``date`` column.
    """
    df = df.copy()

    # Minimal example table keyed by 'MM-DD'; a real project would use the
    # `holidays` package instead.
    korean_holidays = {
        '01-01': '신정',
        '03-01': '삼일절',
        '05-05': '어린이날',
        '08-15': '광복절',
        '10-03': '개천절',
        '12-25': '크리스마스'
    }

    if isinstance(df.index, pd.DatetimeIndex):
        dates = df.index
    else:
        # Wrap the parsed column in a DatetimeIndex so both branches expose
        # .strftime directly; on a Series it is only reachable through .dt.
        dates = pd.DatetimeIndex(pd.to_datetime(df['date']))

    month_day = dates.strftime('%m-%d')
    df['is_holiday'] = month_day.isin(list(korean_holidays)).astype(int)

    # Distance to the neighbouring holidays is not implemented (simplified).
    df['days_to_holiday'] = 0
    df['days_from_holiday'] = 0

    return df

# Apply: add the holiday flag on top of the diff features and count flagged rows.
data_holiday = create_holiday_features(data_diff)
print(f"휴일 수: {data_holiday['is_holiday'].sum()}")

종합 특성 엔지니어링

import pandas as pd
import numpy as np

def full_feature_engineering(df, target_col):
    """Run every feature builder in this file and fill the resulting gaps.

    Args:
        df: Input frame; the date features are read from its index.
        target_col: Column the lag, rolling, EWM and diff features are
            derived from.

    Returns:
        A new frame holding the original columns plus all generated
        features, with missing values back-filled and then forward-filled.
    """
    df = df.copy()

    # 1. Calendar features
    df = create_datetime_features(df)

    # 2. Cyclical encoding
    df = create_cyclical_features(df, 'month', 12)
    df = create_cyclical_features(df, 'dayofweek', 7)

    # 3. Lag features
    df = create_lag_features(df, target_col, [1, 7, 14, 28, 365])

    # 4. Rolling statistics
    df = create_rolling_features(df, target_col, [7, 14, 30])

    # 5. Exponentially weighted means
    df = create_ewm_features(df, target_col, [7, 14, 30])

    # 6. Differences
    df = create_diff_features(df, target_col, [1, 7])

    # 7. Holidays
    df = create_holiday_features(df)

    # Fill the gaps left at the start of the lag/rolling/diff columns.
    # bfill()/ffill() replace the deprecated fillna(method=...) spelling.
    # NOTE(review): back-filling copies later values into the leading rows
    # (the first row's lag_1 becomes that row's own target value), so those
    # rows leak the target; drop them instead if that matters downstream.
    df = df.bfill().ffill()

    return df

# Apply the whole pipeline to the raw frame and report how many columns it
# produced (the raw frame has a single column, 'sales').
data_full = full_feature_engineering(data, 'sales')
print(f"원본 특성 수: 1")
print(f"생성된 특성 수: {data_full.shape[1]}")
print(f"\n특성 목록: {data_full.columns.tolist()}")

PyCaret과 함께 사용

from pycaret.time_series import *

# Build the engineered feature frame
data_engineered = full_feature_engineering(data, 'sales')

# Every engineered column other than the target is a candidate exogenous variable
exog_features = [col for col in data_engineered.columns
                 if col not in ['sales', 'date']]

# PyCaret's time-series setup() has no `exogenous_features` argument: every
# non-target column of `data` is treated as an exogenous variable. Select the
# columns up front instead, keeping only the first ten (memory considerations).
selected_exog = exog_features[:10]

# PyCaret experiment setup
ts = setup(
    data=data_engineered[['sales'] + selected_exog],
    target='sales',
    fh=30,
    fold=3,
    session_id=42,
    verbose=False
)

# Compare the available models and keep the three best
best = compare_models(n_select=3)

특성 선택

from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import matplotlib.pyplot as plt

# Rank the engineered features with a random forest fitted on the full frame.
# NOTE(review): the diff/pct_change columns are computed from the current
# 'sales' value, so together with the lag columns they reveal the target.
data_ml = data_full.dropna()
feature_cols = [name for name in data_ml.columns if name != 'sales']

X, y = data_ml[feature_cols], data_ml['sales']

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Most important features first
importance = pd.DataFrame(
    {'Feature': feature_cols, 'Importance': model.feature_importances_}
)
importance = importance.sort_values('Importance', ascending=False)

top_20 = importance.head(20)
print("상위 20개 특성:")
print(top_20)

# Horizontal bar chart, largest importance at the top
plt.figure(figsize=(12, 8))
importance_ax = plt.gca()
importance_ax.barh(top_20['Feature'], top_20['Importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 20 Feature Importance')
importance_ax.invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance_ts.png', dpi=150)

주의사항

1. 데이터 누출 방지
   - shift(1) 필수: 현재/미래 정보 사용 금지
   - rolling도 과거만 사용

2. NaN 처리
   - Lag/Rolling 초기 NaN 발생
   - 적절한 대체 또는 제거 (bfill은 이후 시점의 값을 앞쪽 행으로 끌어오므로 누출 위험이 있음. 학습용 데이터라면 초기 행을 제거하는 편이 더 안전)

3. 다중공선성
   - 유사한 특성 간 높은 상관관계
   - VIF 체크 또는 특성 선택

4. 계산 비용
   - 너무 많은 특성은 비효율적
   - 중요 특성 위주로 선택

정리

시간 특성: year, month, day, dayofweek 등
순환 인코딩: sin/cos로 주기성 표현
Lag 특성: 과거 값 활용
Rolling 특성: 이동 평균, 표준편차 등
데이터 누출 방지 필수

다음 글 예고

다음 글에서는 시계열 실전 - 판매량 예측을 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #078

개요​

실습 환경​

데이터 준비​

날짜/시간 특성​

기본 시간 특성​

순환 인코딩​

순환 인코딩 필요성​

Lag 특성 (지연 특성)​

Lag 선택 가이드​

이동 평균/통계 특성​

지수 가중 이동 평균 (EWMA)​

차분 특성​

휴일 특성​

종합 특성 엔지니어링​

PyCaret과 함께 사용​

특성 선택​

주의사항​

정리​

다음 글 예고​

개요

실습 환경

데이터 준비

날짜/시간 특성

기본 시간 특성

순환 인코딩

순환 인코딩 필요성

Lag 특성 (지연 특성)

Lag 선택 가이드

이동 평균/통계 특성

지수 가중 이동 평균 (EWMA)

차분 특성

휴일 특성

종합 특성 엔지니어링

PyCaret과 함께 사용

특성 선택

주의사항

정리

다음 글 예고