079 시계열 실전 - 판매량 예측

키워드: 판매량 예측, sales forecasting

개요

판매량 예측은 기업의 재고 관리, 인력 계획, 마케팅 전략에 핵심적인 역할을 합니다. 이 글에서는 PyCaret을 사용하여 실제 판매량 예측 프로젝트를 수행합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

프로젝트 개요

목표: 향후 30일간의 일별 판매량 예측

비즈니스 요구사항:
- 재고 발주 계획 수립 (2주 리드타임)
- 프로모션 효과 예측
- 인력 스케줄링

평가 기준:
- MAPE < 15%
- 주간 총 판매량 오차 < 10%

데이터 생성 (시뮬레이션)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Three years of daily timestamps.
dates = pd.date_range('2021-01-01', periods=365*3, freq='D')
n_days = len(dates)

# Baseline sales level.
base = 1000

# Linear growth component over the whole period.
trend = np.linspace(0, 300, n_days)

# Annual sine cycle, phase-shifted by 60 days.
day_number = np.arange(n_days)
yearly_seasonality = 150 * np.sin(2 * np.pi * (day_number - 60) / 365)

# Weekly pattern: Saturday/Sunday get a fixed uplift.
dayofweek = dates.dayofweek.to_numpy()
weekly_pattern = np.where(dayofweek >= 5, 100, 0)

# Start-of-month and end-of-month uplifts.
day = dates.day.to_numpy()
payday_effect = np.where((1 <= day) & (day <= 5), 50, 0)
monthend_effect = np.where(day >= 25, 30, 0)

# Promotions: each day is a promotion day with probability 0.1 and, if so,
# adds a uniformly drawn uplift. The seed is fixed right before the first
# random draw so the simulated series is reproducible.
np.random.seed(42)
promotion = np.random.binomial(1, 0.1, n_days)
promo_size = np.random.uniform(100, 300, n_days)
promo_effect = promotion * promo_size

# Simplified holiday calendar: the same four month-days in each year.
holiday_dates = [
    f'{year}-{month_day}'
    for year in (2021, 2022, 2023)
    for month_day in ('01-01', '05-05', '08-15', '12-25')
]
holiday = dates.strftime('%Y-%m-%d').isin(holiday_dates).astype(int)
holiday_effect = holiday * 200

# Gaussian noise.
noise = np.random.normal(0, 80, n_days)

# Final sales: sum of all components, floored at zero.
sales = (
    base + trend + yearly_seasonality + weekly_pattern
    + payday_effect + monthend_effect + promo_effect
    + holiday_effect + noise
)
sales = np.clip(sales, 0, None)

# Assemble the frame and index it by date.
data = pd.DataFrame(
    {'date': dates, 'sales': sales, 'promotion': promotion, 'holiday': holiday}
).set_index('date')

print(f"데이터 기간: {data.index.min()} ~ {data.index.max()}")
print(f"평균 판매량: {data['sales'].mean():.0f}")
print(f"판매량 범위: {data['sales'].min():.0f} ~ {data['sales'].max():.0f}")

탐색적 데이터 분석

import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_series, ax_monthly), (ax_weekday, ax_month) = axes

sales_series = data['sales']

# Panel 1: the full daily series.
ax_series.plot(data.index, sales_series, alpha=0.7)
ax_series.set(title='Daily Sales (3 Years)', xlabel='Date', ylabel='Sales')

# Panel 2: mean sales per calendar month (month-start bins).
monthly = sales_series.resample('MS').mean()
ax_monthly.plot(monthly.index, monthly.values, 'o-')
ax_monthly.set(title='Monthly Average Sales', xlabel='Date', ylabel='Average Sales')

# Panel 3: mean sales per day of week (0 = Monday).
daily_avg = sales_series.groupby(data.index.dayofweek).mean()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax_weekday.bar(day_names, daily_avg.values)
ax_weekday.set(title='Average Sales by Day of Week', xlabel='Day', ylabel='Average Sales')

# Panel 4: mean sales per month of year.
monthly_avg = sales_series.groupby(data.index.month).mean()
ax_month.bar(range(1, 13), monthly_avg.values)
ax_month.set(title='Average Sales by Month', xlabel='Month', ylabel='Average Sales')

plt.tight_layout()
plt.savefig('sales_eda.png', dpi=150)

특성 엔지니어링

import pandas as pd
import numpy as np

def create_sales_features(df):
    """Build calendar and history-based features for daily sales forecasting.

    Every feature derived from ``sales`` is computed from rows strictly
    before the current one, so the feature values of a row never depend on
    that row's own ``sales`` value (no target leakage).

    Args:
        df: DataFrame with a DatetimeIndex and a numeric ``sales`` column.
            Lags, windows and differences are row-based, so names such as
            ``sales_lag_7`` only mean "7 days ago" when the frame has one
            row per consecutive day.

    Returns:
        A copy of ``df`` with the feature columns added; ``df`` itself is
        not modified. History-based columns are NaN for the leading rows
        that do not have enough past observations.
    """
    df = df.copy()
    idx = df.index

    # Calendar features read straight from the index.
    df['year'] = idx.year
    df['month'] = idx.month
    df['day'] = idx.day
    df['dayofweek'] = idx.dayofweek
    df['dayofyear'] = idx.dayofyear
    df['weekofyear'] = idx.isocalendar().week.astype(int)
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # Cyclical encoding so that December/January and Sunday/Monday end up
    # close to each other.
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

    # Start-of-month / end-of-month flags.
    df['is_monthstart'] = (df['day'] <= 5).astype(int)
    df['is_monthend'] = (df['day'] >= 25).astype(int)

    # Plain lags of the target.
    for lag in [1, 7, 14, 28, 365]:
        df[f'sales_lag_{lag}'] = df['sales'].shift(lag)

    # Sales history as known before the current row; all statistics below
    # are computed on this series instead of the raw target.
    history = df['sales'].shift(1)

    # Rolling mean / standard deviation of past sales.
    for window in [7, 14, 28]:
        past_window = history.rolling(window)
        df[f'sales_rolling_mean_{window}'] = past_window.mean()
        df[f'sales_rolling_std_{window}'] = past_window.std()

    # Exponentially weighted mean of past sales.
    for span in [7, 14, 28]:
        df[f'sales_ewm_{span}'] = history.ewm(span=span).mean()

    # Differences of past sales (previous row vs. 1 and 7 rows before it).
    # Differencing the raw target would put sales[t] itself into the row.
    df['sales_diff_1'] = history.diff(1)
    df['sales_diff_7'] = history.diff(7)

    # Year-over-year ratio of the previous row; a zero denominator is
    # turned into NaN to avoid dividing by zero.
    df['sales_yoy_ratio'] = history / history.shift(365).replace(0, np.nan)

    return df

# Build the feature table from the simulated sales data and report how many
# columns it has.
data_features = create_sales_features(data)
print(f"생성된 특성 수: {len(data_features.columns)}")

PyCaret 모델링

from pycaret.time_series import *

# Exogenous regressors handed to the forecasters alongside the target.
exog_features = ['promotion', 'holiday', 'is_weekend', 'is_monthstart', 'is_monthend']

# PyCaret's time-series setup() has no `exogenous_features` argument: every
# non-target column of the frame passed as `data` is treated as an exogenous
# variable. The calendar flags only exist in `data_features`, so the modelling
# frame is built from there and restricted to the target plus the regressors.
model_data = data_features[['sales'] + exog_features]

# Experiment configuration.
ts = setup(
    data=model_data,
    target='sales',
    fh=30,  # 30-day forecast horizon
    fold=5,
    seasonal_period=7,  # weekly pattern
    # Keep models without exogenous support (e.g. ETS) available; they are
    # fitted on the target alone.
    enforce_exogenous=False,
    session_id=42,
    verbose=False
)

# Cross-validated comparison of the available models; keep the top five.
print("=== 모델 비교 ===")
best_models = compare_models(n_select=5)

상위 모델 분석

from pycaret.time_series import *
import matplotlib.pyplot as plt

# Candidate models, each fitted with the cross-validation configured in setup().
models = {
    'Auto ARIMA': create_model('auto_arima'),
    'ETS': create_model('ets'),
    'Prophet': create_model('prophet'),
    'LightGBM': create_model('lightgbm_cds_dt')
}

# Compare the forecasts against the most recent 90 days of actual sales.
plt.figure(figsize=(15, 8))
plt.plot(data.index[-90:], data['sales'].values[-90:], 'k-', label='Actual', linewidth=2)

colors = ['blue', 'red', 'green', 'orange']
forecast_start = None

for (name, model), color in zip(models.items(), colors):
    # predict_model() on a model that has not been finalized forecasts the
    # hold-out window at the end of the data, not dates beyond it. Plot each
    # forecast on its own index so it lines up with the actual values it
    # predicts.
    pred = predict_model(model)
    pred_dates = pred.index.to_timestamp() if isinstance(pred.index, pd.PeriodIndex) else pred.index
    if forecast_start is None:
        forecast_start = pred_dates[0]
    plt.plot(pred_dates, pred['y_pred'].values, '--', color=color, label=name, linewidth=1.5)

# Mark where the forecast window begins.
if forecast_start is not None:
    plt.axvline(x=forecast_start, color='gray', linestyle=':', label='Forecast Start')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.title('30-Day Sales Forecast Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('sales_forecast_comparison.png', dpi=150)

최종 모델 선택 및 튜닝

from pycaret.time_series import *

# Model picked for tuning (LightGBM here, as an example).
best_model = create_model('lightgbm_cds_dt')

# Hyperparameter tuning starting from that model.
tuned_model = tune_model(best_model)

# Diagnostic plots for the tuned model.
for plot_kind in ('forecast', 'diagnostics'):
    plot_model(tuned_model, plot=plot_kind)

예측 결과 분석

from pycaret.time_series import *
import pandas as pd

# Refit the tuned model on the full data set.
final_model = finalize_model(tuned_model)

# Dates covered by the 30-day forecast.
future_dates = pd.date_range(data.index[-1] + pd.Timedelta(days=1), periods=30, freq='D')

# A finalized model that was trained with exogenous variables needs their
# future values at prediction time. The calendar flags are known in advance;
# promotions are assumed to be absent (baseline scenario) and holidays repeat
# on the same month-days as in the historical calendar.
holiday_month_days = {d[5:] for d in holiday_dates}
future_exog = pd.DataFrame(
    {
        'promotion': 0,
        'holiday': future_dates.strftime('%m-%d').isin(holiday_month_days).astype(int),
        'is_weekend': (future_dates.dayofweek >= 5).astype(int),
        'is_monthstart': (future_dates.day <= 5).astype(int),
        'is_monthend': (future_dates.day >= 25).astype(int),
    },
    # NOTE(review): a daily PeriodIndex is used on the assumption that PyCaret
    # works with period indices internally - confirm against the installed
    # version.
    index=future_dates.to_period('D'),
)
predictions = predict_model(final_model, fh=30, X=future_exog[exog_features])

# Forecast table indexed by calendar date.
forecast_df = pd.DataFrame({
    'date': future_dates,
    'predicted_sales': predictions['y_pred'].values
})
forecast_df.set_index('date', inplace=True)

# Weekly totals (weeks ending on Sunday; edge weeks may be partial).
weekly_forecast = forecast_df.resample('W').sum()
print("\n주간 예측:")
print(weekly_forecast.round(0))

# Monthly totals (edge months may be partial).
monthly_forecast = forecast_df.resample('MS').sum()
print("\n월간 예측:")
print(monthly_forecast.round(0))

재고 계획 연동

import pandas as pd
import numpy as np

def create_inventory_plan(forecast_df, lead_time=14, safety_stock_days=7):
    """Derive a simple reorder plan from a daily sales forecast.

    Args:
        forecast_df: Table with a ``predicted_sales`` column holding one
            forecast value per day, in chronological order.
        lead_time: Number of days between placing and receiving an order.
            Demand over the first ``lead_time`` forecast days must be
            covered by the order.
        safety_stock_days: Number of average forecast days kept as buffer.

    Returns:
        dict with ``lead_time_demand``, ``safety_stock``,
        ``reorder_quantity`` (the sum of the two), ``avg_daily_sales`` and
        ``total_30day_forecast`` (sum over the whole forecast, whatever its
        length).

    Raises:
        ValueError: If the forecast is empty, if ``lead_time`` or
            ``safety_stock_days`` is negative, or if the forecast covers
            fewer days than ``lead_time``.
    """
    daily_forecast = np.asarray(forecast_df['predicted_sales'], dtype=float)

    if daily_forecast.size == 0:
        raise ValueError("forecast_df contains no forecast rows")
    if lead_time < 0 or safety_stock_days < 0:
        raise ValueError("lead_time and safety_stock_days must be non-negative")
    if lead_time > daily_forecast.size:
        # Summing a truncated slice would silently understate the demand.
        raise ValueError(
            f"forecast covers {daily_forecast.size} days, "
            f"fewer than lead_time={lead_time}"
        )

    # Demand expected while the order is in transit.
    lead_time_demand = np.sum(daily_forecast[:lead_time])

    # Safety stock: average daily forecast times the number of buffer days.
    avg_daily = np.mean(daily_forecast)
    safety_stock = avg_daily * safety_stock_days

    # Quantity to order.
    reorder_quantity = lead_time_demand + safety_stock

    plan = {
        'lead_time_demand': lead_time_demand,
        'safety_stock': safety_stock,
        'reorder_quantity': reorder_quantity,
        'avg_daily_sales': avg_daily,
        'total_30day_forecast': np.sum(daily_forecast)
    }

    return plan

# Build the inventory plan from the forecast table (default lead time and
# safety-stock days).
inventory_plan = create_inventory_plan(forecast_df)

# Print each plan entry as a thousands-separated whole number.
print("\n=== 재고 계획 ===")
report_rows = [
    ("리드타임(14일) 예상 수요", 'lead_time_demand'),
    ("안전 재고(7일분)", 'safety_stock'),
    ("권장 발주량", 'reorder_quantity'),
    ("일평균 예상 판매량", 'avg_daily_sales'),
    ("30일 총 예상 판매량", 'total_30day_forecast'),
]
for label, key in report_rows:
    print(f"{label}: {inventory_plan[key]:,.0f}")

프로모션 효과 시뮬레이션

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Promotion scenario comparison.
# In a real implementation the forecast would be regenerated with adjusted
# exogenous variables; here fixed uplift factors are applied to the forecast.
baseline_forecast = forecast_df['predicted_sales']

# Scenario 1: no promotion - the forecast total as is.
scenario_no_promo = baseline_forecast.sum()

# Scenario 2: weekend promotion - 20% uplift on Saturdays and Sundays only.
weekend_mask = forecast_df.index.dayofweek >= 5
promo_uplift = np.where(weekend_mask, 0.2, 0)
scenario_weekend_promo = baseline_forecast.values * (1 + promo_uplift)

# Scenario 3: promotion on every day - 15% uplift throughout.
scenario_full_promo = baseline_forecast.values * 1.15

# Totals and relative change versus the no-promotion scenario.
weekend_total = scenario_weekend_promo.sum()
full_total = scenario_full_promo.sum()

print("\n=== 프로모션 시나리오 비교 ===")
print(f"시나리오 1 (프로모션 없음): {scenario_no_promo:,.0f}")
print(f"시나리오 2 (주말 프로모션): {weekend_total:,.0f} (+{(weekend_total/scenario_no_promo - 1)*100:.1f}%)")
print(f"시나리오 3 (전체 프로모션): {full_total:,.0f} (+{(full_total/scenario_no_promo - 1)*100:.1f}%)")

예측 정확도 모니터링

import pandas as pd
import numpy as np

def calculate_forecast_accuracy(actual, predicted):
    """Compute point-forecast accuracy metrics.

    Inputs are converted to float arrays and compared position by position,
    so lists, NumPy arrays and pandas Series (regardless of their index) are
    all accepted.

    Args:
        actual: Observed values.
        predicted: Forecast values; must be broadcastable to ``actual``.

    Returns:
        dict with
        ``MAE``  - mean absolute error,
        ``RMSE`` - root mean squared error,
        ``MAPE`` - mean absolute percentage error in percent, computed only
        over positions where ``actual`` is non-zero (NaN if there are none),
        ``Bias`` - mean of ``predicted - actual`` (positive means
        over-forecasting).
        All metrics are NaN for empty input.

    Raises:
        ValueError: If the two inputs cannot be broadcast to a common shape.
    """
    actual, predicted = np.broadcast_arrays(
        np.asarray(actual, dtype=float),
        np.asarray(predicted, dtype=float),
    )

    if actual.size == 0:
        # Nothing to evaluate; avoid NumPy's empty-mean warnings.
        return {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan, 'Bias': np.nan}

    errors = predicted - actual
    abs_errors = np.abs(errors)

    mae = np.mean(abs_errors)
    rmse = np.sqrt(np.mean(errors ** 2))

    # Percentage error is undefined where the actual value is zero; those
    # positions are excluded instead of producing inf/NaN for the metric.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(abs_errors[nonzero] / np.abs(actual[nonzero])) * 100
    else:
        mape = np.nan

    bias = np.mean(errors)  # positive: over-forecasting

    return {
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': mape,
        'Bias': bias
    }

# Back-testing means validating against historical data; in production the
# forecasts are compared with actual sales. Only the accuracy targets are
# printed here.
print("\n=== 예측 정확도 목표 ===")
for target_line in (
    "MAPE < 15%: 일별 예측",
    "MAPE < 10%: 주간 집계",
    "MAPE < 5%: 월간 집계",
):
    print(target_line)

대시보드용 출력

import matplotlib.pyplot as plt
import pandas as pd

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_forecast, ax_weekday), (ax_weekly, ax_cumulative) = axes

predicted = forecast_df['predicted_sales']

# Panel 1: the 30-day forecast with a fixed +/-15% band around it.
ax_forecast.plot(forecast_df.index, predicted, 'b-', linewidth=2)
ax_forecast.fill_between(
    forecast_df.index,
    predicted * 0.85,
    predicted * 1.15,
    alpha=0.3,
    label='±15% Range',
)
ax_forecast.set(title='30-Day Sales Forecast', xlabel='Date', ylabel='Predicted Sales')
ax_forecast.legend()
ax_forecast.grid(True, alpha=0.3)

# Panel 2: mean forecast per day of week (0 = Monday).
daily_avg = predicted.groupby(forecast_df.index.dayofweek).mean()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax_weekday.bar(day_names, daily_avg.values, color='steelblue')
ax_weekday.set(title='Predicted Average by Day of Week', ylabel='Average Sales')

# Panel 3: weekly totals, labelled W1, W2, ...
weekly = forecast_df.resample('W').sum()
week_positions = range(len(weekly))
ax_weekly.bar(week_positions, weekly['predicted_sales'].values, color='coral')
ax_weekly.set(title='Weekly Sales Forecast', xlabel='Week', ylabel='Total Sales')
ax_weekly.set_xticks(week_positions)
ax_weekly.set_xticklabels([f'W{week_no}' for week_no in range(1, len(weekly) + 1)])

# Panel 4: running total of the forecast.
cumsum = predicted.cumsum()
ax_cumulative.plot(forecast_df.index, cumsum, 'g-', linewidth=2)
ax_cumulative.set(title='Cumulative Sales Forecast', xlabel='Date', ylabel='Cumulative Sales')
ax_cumulative.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('sales_dashboard.png', dpi=150)

# Summary statistics of the forecast.
print("\n=== 30일 예측 요약 ===")
print(f"총 예상 판매량: {forecast_df['predicted_sales'].sum():,.0f}")
print(f"일 평균: {forecast_df['predicted_sales'].mean():,.0f}")
print(f"최소: {forecast_df['predicted_sales'].min():,.0f}")
print(f"최대: {forecast_df['predicted_sales'].max():,.0f}")

모델 저장

from pycaret.time_series import *

# Save the finalized model under the name 'sales_forecast_model'
save_model(final_model, 'sales_forecast_model')

# To load it again later:
# loaded_model = load_model('sales_forecast_model')
# new_predictions = predict_model(loaded_model)

정리

판매량 예측: 비즈니스 핵심 역량
특성 엔지니어링: 시간, Lag, Rolling 특성
외생 변수: 프로모션, 휴일
재고/인력 계획 연동
정기적 모니터링 및 재학습

다음 글 예고

다음 글에서는 시계열 실전 - 주가 예측을 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #079

개요​

실습 환경​

프로젝트 개요​

데이터 생성 (시뮬레이션)​

탐색적 데이터 분석​

특성 엔지니어링​

PyCaret 모델링​

상위 모델 분석​

최종 모델 선택 및 튜닝​

예측 결과 분석​

재고 계획 연동​

프로모션 효과 시뮬레이션​

예측 정확도 모니터링​

대시보드용 출력​

모델 저장​

정리​

다음 글 예고​

개요

실습 환경

프로젝트 개요

데이터 생성 (시뮬레이션)

탐색적 데이터 분석

특성 엔지니어링

PyCaret 모델링

상위 모델 분석

최종 모델 선택 및 튜닝

예측 결과 분석

재고 계획 연동

프로모션 효과 시뮬레이션

예측 정확도 모니터링

대시보드용 출력

모델 저장

정리

다음 글 예고