051 시계열 데이터란?

키워드: 시계열, time series, 순차 데이터

개요

시계열(Time Series) 데이터는 시간 순서에 따라 수집된 데이터입니다. 주가, 날씨, 매출, 센서 데이터 등 많은 실제 데이터가 시계열 형태입니다. 이 글에서는 시계열 데이터의 기본 개념과 특성을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pandas, numpy, matplotlib (아래 설치 명령에 포함된 statsmodels는 이 글의 코드에서는 사용하지 않습니다)

pip install pandas numpy matplotlib statsmodels

시계열 데이터의 정의

기본 개념

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build a one-year daily example series as trend + seasonality + noise.
n_days = 365
dates = pd.date_range(start='2023-01-01', periods=n_days, freq='D')
day_numbers = np.arange(n_days)

# Linear trend from 100 to 150 across the whole range.
trend = np.linspace(100, 150, n_days)
# One full sine cycle over the range, amplitude 20.
seasonality = 20 * np.sin(day_numbers * 2 * np.pi / n_days)
# Scaled standard-normal noise; the seed is fixed right before the draw so
# the generated values are reproducible.
np.random.seed(42)
noise = np.random.randn(n_days) * 5

values = trend + seasonality + noise
ts = pd.Series(data=values, index=dates, name='value')

print("시계열 데이터 예시:")
print(ts.head(10))
print(f"\n데이터 기간: {ts.index.min()} ~ {ts.index.max()}")
print(f"데이터 포인트 수: {len(ts)}")

시계열 vs 일반 데이터

# Contrast between ordinary tabular data and time-series data.
# Each row is (property, ordinary data, time-series data).
comparison_columns = ('특성', '일반 데이터', '시계열 데이터')
comparison_rows = [
    ('순서', '순서 무관', '시간 순서 중요'),
    ('독립성', '샘플 독립', '연속 샘플 의존적'),
    ('예측 방향', '특성 → 타겟', '과거 → 미래'),
    ('분할 방법', '랜덤 분할', '시간 기반 분할'),
]

# Transpose the rows into the column -> values mapping the DataFrame is built from.
comparison = {
    column: list(cells)
    for column, cells in zip(comparison_columns, zip(*comparison_rows))
}

comparison_table = pd.DataFrame(comparison)
print("\n시계열 vs 일반 데이터:")
print(comparison_table.to_string(index=False))

시계열 데이터의 구성요소

4가지 구성요소

# Plot the series and the components it was assembled from on four stacked axes.
fig, axes = plt.subplots(4, 1, figsize=(14, 12))

# One entry per panel, top to bottom: (title, data, format string, extra style).
panels = [
    ('Original Time Series', values, 'b-', {'alpha': 0.7}),
    ('Trend Component', trend, 'g-', {'linewidth': 2}),
    ('Seasonal Component', seasonality, 'orange', {'linewidth': 2}),
    ('Residual (Noise)', noise, 'r-', {'alpha': 0.7}),
]

for ax, (title, component, fmt, style) in zip(axes, panels):
    ax.plot(dates, component, fmt, **style)
    ax.set_title(title)
    ax.set_ylabel('Value')

# Only the bottom panel gets an x-axis label.
axes[-1].set_xlabel('Date')

plt.tight_layout()
plt.show()

# Reference table of the four time-series components.
# Each row is (component, description, example).
component_columns = ('구성요소', '설명', '예시')
component_rows = [
    ('트렌드(Trend)', '장기적인 증가/감소 패턴', '경제 성장, 인구 증가'),
    ('계절성(Seasonality)', '일정 주기로 반복되는 패턴', '월별 매출, 요일별 트래픽'),
    ('주기(Cycle)', '비고정 주기의 변동', '경기 순환'),
    ('잔차(Residual)', '설명되지 않는 무작위 변동', '무작위 이벤트'),
]

# Transpose the rows into the column -> values mapping the DataFrame is built from.
components = {
    column: list(cells)
    for column, cells in zip(component_columns, zip(*component_rows))
}

components_table = pd.DataFrame(components)
print("\n시계열 구성요소:")
print(components_table.to_string(index=False))

시계열 데이터 유형

단변량 vs 다변량

# Univariate series: a single variable recorded per timestamp.
univariate_ts = pd.DataFrame({'date': dates, 'sales': values})

print("단변량 시계열 (1개 변수):")
print(univariate_ts.head())

# Multivariate series: several variables recorded at the same timestamps.
# The random draws are made in column order (temperature, then advertising)
# so the generated numbers match the order in which the columns are listed.
day_of_year = np.arange(365)
temperature = 20 + 10 * np.sin(day_of_year * 2 * np.pi / 365) + np.random.randn(365) * 2
advertising = np.random.uniform(1000, 5000, 365)

multivariate_ts = pd.DataFrame({
    'date': dates,
    'sales': values,
    'temperature': temperature,
    'advertising': advertising,
})

print("\n다변량 시계열 (여러 변수):")
print(multivariate_ts.head())

등간격 vs 불규칙 간격

# Regularly spaced series: one observation per calendar day.
daily_index = pd.date_range('2023-01-01', periods=10, freq='D')
regular_ts = pd.Series(np.random.randn(10), index=daily_index)
print("\n등간격 시계열 (매일):")
print(regular_ts)

# Irregularly spaced series: the gap between observations varies.
irregular_dates = ['2023-01-01', '2023-01-03', '2023-01-07', '2023-01-08', '2023-01-15']
irregular_index = pd.to_datetime(irregular_dates)
irregular_ts = pd.Series(np.random.randn(5), index=irregular_index)
print("\n불규칙 간격 시계열:")
print(irregular_ts)

pandas로 시계열 다루기

DatetimeIndex 생성

# DatetimeIndex frequency aliases mapped to their display labels.
# pandas 2.2 renamed the month-end, quarter-end, hourly and minutely aliases
# from 'M'/'Q'/'H'/'T' to 'ME'/'QE'/'h'/'min'. The old spellings emit a
# FutureWarning from 2.2 onwards and are rejected by later releases, so the
# current spellings are used here.
frequencies = {
    'D': '일별',
    'W': '주별',
    'ME': '월별',
    'QE': '분기별',
    'h': '시간별',
    'min': '분별'
}

# Pre-2.2 spellings, used only when the installed pandas does not recognise
# the current alias.
legacy_aliases = {'ME': 'M', 'QE': 'Q', 'h': 'H', 'min': 'T'}

print("다양한 시계열 빈도:")
for freq, name in frequencies.items():
    try:
        idx = pd.date_range('2023-01-01', periods=5, freq=freq)
    except ValueError:
        # Older pandas raises ValueError for an unknown alias; retry with the
        # spelling it understands.
        idx = pd.date_range('2023-01-01', periods=5, freq=legacy_aliases.get(freq, freq))
    print(f"\n{name} ({freq}):")
    print(f"  {list(idx.strftime('%Y-%m-%d %H:%M'))}")

시계열 인덱싱

# One year of daily random values used to demonstrate date-based indexing.
year_index = pd.date_range('2023-01-01', periods=365, freq='D')
ts_full = pd.Series(np.random.randn(365), index=year_index)

print("시계열 인덱싱:")

# A full date string selects a single value.
print(f"\n2023-01-15: {ts_full['2023-01-15']:.4f}")

# A year-month string selects every row in that month.
january = ts_full.loc['2023-01']
print("\n2023년 1월 (처음 5개):")
print(january.head())

# A string slice selects a date range; both endpoints are included.
early_march = ts_full.loc['2023-03-01':'2023-03-05']
print("\n2023-03-01 ~ 2023-03-05:")
print(early_march)

리샘플링

# Convert daily data to coarser frequencies with resample().
daily_data = pd.Series(
    np.random.randn(365) + 100,
    index=pd.date_range('2023-01-01', periods=365, freq='D')
)

# Weekly mean
weekly = daily_data.resample('W').mean()

# Monthly total. 'ME' (month end) is the current alias: the older 'M' emits a
# FutureWarning from pandas 2.2 onwards and is rejected by later releases.
try:
    monthly = daily_data.resample('ME').sum()
except ValueError:
    # pandas older than 2.2 does not know 'ME'; fall back to its spelling.
    monthly = daily_data.resample('M').sum()

print("리샘플링 예시:")
print("\n일별 → 주별 (평균):")
print(weekly.head())
print("\n일별 → 월별 (합계):")
print(monthly.head())

시계열 시각화

기본 시각화

# 2x2 grid of basic time-series views: line plot, moving average,
# per-month box plot, and autocorrelation by lag.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_line, ax_ma), (ax_box, ax_acf) = axes

# Panel 1: raw line plot
ax_line.plot(ts.index, ts.values)
ax_line.set_title('Line Plot')
ax_line.set_xlabel('Date')
ax_line.set_ylabel('Value')

# Panel 2: original series with its rolling mean overlaid
window = 30
rolling_mean = ts.rolling(window=window).mean()
ax_ma.plot(ts.index, ts.values, alpha=0.5, label='Original')
ax_ma.plot(ts.index, rolling_mean, 'r-', linewidth=2, label=f'{window}-day MA')
ax_ma.set_title('Moving Average')
ax_ma.legend()

# Panel 3: distribution of values grouped by calendar month
monthly_data = pd.DataFrame({'value': ts.values, 'month': ts.index.month})
monthly_data.boxplot(column='value', by='month', ax=ax_box)
ax_box.set_title('Monthly Distribution')
ax_box.set_xlabel('Month')
# Blank out the figure-level title (presumably the one the grouped boxplot adds).
plt.suptitle('')

# Panel 4: autocorrelation for lags 1 through 30
lags = range(1, 31)
autocorr = [ts.autocorr(lag=k) for k in lags]
ax_acf.bar(lags, autocorr)
ax_acf.set_title('Autocorrelation')
ax_acf.set_xlabel('Lag')
ax_acf.set_ylabel('Correlation')

plt.tight_layout()
plt.show()

시계열 예측의 과제

# Common challenges in time-series forecasting.
# Each row is (challenge, description, remedy).
challenge_columns = ('과제', '설명', '해결책')
challenge_rows = [
    ('데이터 누출', '미래 정보가 학습에 포함됨', '시간 기반 분할, Lag 특성'),
    ('비정상성', '평균/분산이 시간에 따라 변함', '차분, 로그 변환'),
    ('계절성 변화', '계절 패턴이 변할 수 있음', '적응형 모델'),
    ('이상치', '급격한 이벤트로 인한 이상값', '탐지 및 처리'),
    ('결측치', '일부 시점의 데이터 누락', '보간법, 전방/후방 채움'),
]

# Transpose the rows into the column -> values mapping the DataFrame is built from.
challenges = {
    column: list(cells)
    for column, cells in zip(challenge_columns, zip(*challenge_rows))
}

challenges_table = pd.DataFrame(challenges)
print("\n시계열 예측의 과제:")
print(challenges_table.to_string(index=False))

FLAML에서의 시계열

# Summary, as presented by this article, of how FLAML handles time series:
# the task is framed as regression and feature engineering is the key step.
flaml_summary = (
    "\nFLAML 시계열 접근법:",
    "  1. Lag 특성 생성 (t-1, t-7, t-30 등)",
    "  2. 이동 평균 특성",
    "  3. 날짜 특성 (요일, 월, 분기)",
    "  4. 시간 기반 분할 (split_type='time')",
    "  5. 회귀 모델로 예측",
)

for line in flaml_summary:
    print(line)

정리

시계열 데이터: 시간 순서로 수집된 데이터
구성요소: 트렌드, 계절성, 주기, 잔차
특징: 시간 순서 중요, 연속 샘플 의존적
pandas: DatetimeIndex, 리샘플링, 슬라이싱
과제: 데이터 누출 방지, 비정상성 처리
FLAML은 특성 엔지니어링 + 회귀 방식으로 접근

다음 글 예고

다음 글에서는 시계열 데이터 전처리에 대해 알아보겠습니다. 결측치 처리, 이상치 탐지, 정규화 등을 다룹니다.

FLAML AutoML 마스터 시리즈 #051

개요

실습 환경

시계열 데이터의 정의

기본 개념

시계열 vs 일반 데이터

시계열 데이터의 구성요소

4가지 구성요소

시계열 데이터 유형

단변량 vs 다변량

등간격 vs 불규칙 간격

pandas로 시계열 다루기

DatetimeIndex 생성

시계열 인덱싱

리샘플링

시계열 시각화

기본 시각화

시계열 예측의 과제

FLAML에서의 시계열

정리

다음 글 예고

개요