044 회귀 프로젝트 - 매출 예측

키워드: 매출 예측, 비즈니스, 회귀

개요

매출 예측은 비즈니스에서 매우 중요한 문제입니다. 과거 매출 데이터와 다양한 요인을 분석하여 미래 매출을 예측합니다. 이 글에서는 소매점 매출 예측 모델을 개발합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, numpy, scikit-learn, matplotlib

pip install flaml[automl] pandas scikit-learn matplotlib

프로젝트 개요

목표

다양한 요인을 고려한 일별/주별 매출 예측

비즈니스 가치

재고 관리 최적화
인력 배치 계획
마케팅 예산 책정
현금 흐름 관리

Step 1: 데이터 준비

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# Build a synthetic two-year daily sales history for a single retail store.
np.random.seed(42)  # fixed seed so every random draw below is reproducible
n_days = 730  # two years of daily records

# Calendar covered by the simulation.
start_date = datetime(2022, 1, 1)
dates = [start_date + timedelta(days=offset) for offset in range(n_days)]

# Deterministic components of the sales signal.
base_sales = 50000
day_index = np.arange(n_days)
trend = np.linspace(0, 10000, n_days)  # steady growth over the period
seasonality = 10000 * np.sin(day_index * 2 * np.pi / 365)  # yearly cycle
weekly = 5000 * np.sin(day_index * 2 * np.pi / 7)  # weekly cycle

# Event uplift: Nov 20-30 (Black Friday window) and the whole of December.
holiday_boost = np.array([
    20000.0 if (d.month == 11 and d.day >= 20)
    else 15000.0 if d.month == 12
    else 0.0
    for d in dates
])

# Random components. The order of these draws matters for reproducibility.
weather_effect = np.random.randn(n_days) * 3000
promotions = np.random.choice([0, 1], n_days, p=[0.9, 0.1])
promo_effect = promotions * np.random.uniform(5000, 15000, n_days)

# Combine all components and floor the result at a minimum daily sale.
sales = base_sales + trend + seasonality + weekly + holiday_boost + weather_effect + promo_effect
sales = sales.clip(min=10000)

# Temperature is drawn independently of the sales signal.
temperature = np.random.uniform(0, 35, n_days).round(1)

df = pd.DataFrame({
    'date': dates,
    'sales': sales.astype(int),
    'promotion': promotions,
    'temperature': temperature,
    'is_weekend': [int(d.weekday() >= 5) for d in dates],
    'is_holiday': [int(d.month == 12 and 20 <= d.day <= 31) for d in dates],
})

# Calendar features derived from the date column.
for part in ('year', 'month', 'day', 'dayofweek'):
    df[part] = getattr(df['date'].dt, part)
df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
df['quarter'] = df['date'].dt.quarter

first_day = df['date'].min().date()
last_day = df['date'].max().date()
print("데이터셋 정보:")
print(f"  기간: {first_day} ~ {last_day}")
print(f"  샘플 수: {len(df)}")
print("\n처음 5행:")
print(df.head())

Step 2: 탐색적 데이터 분석

매출 추이

# Four-panel overview of the sales data: the raw daily series plus average
# sales by month, by day of week, and with/without a promotion.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: raw daily series.
axes[0, 0].plot(df['date'], df['sales'], alpha=0.7)
axes[0, 0].set_xlabel('Date')
axes[0, 0].set_ylabel('Sales ($)')
axes[0, 0].set_title('Daily Sales Trend')

# Top-right: mean sales per calendar month.
monthly = df.groupby('month')['sales'].mean()
axes[0, 1].bar(monthly.index, monthly.values)
axes[0, 1].set_xlabel('Month')
axes[0, 1].set_ylabel('Average Sales ($)')
axes[0, 1].set_title('Average Sales by Month')

# Bottom-left: mean sales per weekday (pandas dayofweek 0 = Monday).
dow_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
daily = df.groupby('dayofweek')['sales'].mean()
axes[1, 0].bar(dow_names, daily.values)
axes[1, 0].set_xlabel('Day of Week')
axes[1, 0].set_ylabel('Average Sales ($)')
axes[1, 0].set_title('Average Sales by Day of Week')

# Bottom-right: mean sales on days without / with a promotion.
# Kept under its own name: reusing `promo_effect` here would overwrite the
# per-day promotion uplift array that the sales formula is built from.
promo_mean_sales = df.groupby('promotion')['sales'].mean()
axes[1, 1].bar(['No Promo', 'Promo'], promo_mean_sales.values)
axes[1, 1].set_xlabel('Promotion')
axes[1, 1].set_ylabel('Average Sales ($)')
axes[1, 1].set_title('Promotion Effect on Sales')

plt.tight_layout()
plt.show()

상관관계

# Pearson correlation of every numeric column with daily sales,
# strongest positive relationship first.
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df.loc[:, numeric_cols].corr()
correlation = corr_matrix.loc[:, 'sales'].sort_values(ascending=False)

print("매출과의 상관관계:")
print(correlation)

Step 3: 특성 엔지니어링

# Work on a copy so the raw frame stays untouched.
df_features = df.copy()

# Moving averages over the previous 7 / 30 days. Shifting by one day first
# means the current day's own sales never enters its window.
past_sales = df_features['sales'].shift(1)
for window in (7, 30):
    df_features[f'sales_ma{window}'] = past_sales.rolling(window).mean()

# Lag features: the sales observed 1, 7, 14 and 30 days earlier.
for lag in (1, 7, 14, 30):
    df_features[f'sales_lag{lag}'] = df_features['sales'].shift(lag)

# The first 30 rows lack a full history for the longest window/lag, so they
# contain NaN and are dropped.
df_features = df_features.dropna()

n_feature_cols = df_features.shape[1] - 2  # every column except date and sales
print(f"특성 엔지니어링 후 샘플 수: {len(df_features)}")
print(f"특성 수: {n_feature_cols}")

Step 4: 데이터 분할

from sklearn.model_selection import train_test_split

# Every column except the date and the target is a model feature.
feature_cols = list(df_features.drop(columns=['date', 'sales'], errors='ignore').columns)
X, y = df_features[feature_cols], df_features['sales']

# Chronological split: the final 60 rows form the test set, everything
# before them is used for training (no shuffling).
split_idx = len(df_features) - 60

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
dates_test = df_features['date'].iloc[split_idx:]

test_start = dates_test.min().date()
test_end = dates_test.max().date()
print(f"학습 데이터: {len(X_train)}개")
print(f"테스트 데이터: {len(X_test)}개")
print(f"테스트 기간: {test_start} ~ {test_end}")

Step 5: FLAML AutoML 학습

from flaml import AutoML

# Search for the best regressor within a fixed time budget.
automl = AutoML()
automl.fit(
    X_train, y_train,
    task="regression",
    time_budget=120,  # search budget in seconds
    metric="mape",  # ratio-based error, comparable across sales levels
    # The rows are ordered by date. Validate on chronologically later rows
    # instead of a shuffled split, so lag / moving-average features cannot
    # leak future sales into validation and inflate the reported score.
    split_type="time",
    seed=42,
    verbose=1
)

# best_loss is the validation MAPE as a fraction; scale it to percent.
print(f"\n최적 모델: {automl.best_estimator}")
print(f"검증 MAPE: {automl.best_loss * 100:.2f}%")

Step 6: 모델 평가

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict the held-out test period with the tuned model.
y_pred = automl.predict(X_test)

# Error metrics: RMSE and MAE are in sales units, R² is unitless,
# MAPE is the mean absolute relative error expressed in percent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
relative_error = (y_test - y_pred) / y_test
mape = np.mean(np.abs(relative_error)) * 100

report_lines = [
    "테스트 성능:",
    f"  RMSE: ${rmse:,.0f}",
    f"  MAE: ${mae:,.0f}",
    f"  R²: {r2:.4f}",
    f"  MAPE: {mape:.2f}%",
]
for line in report_lines:
    print(line)

예측 시각화

# Two stacked panels: actual vs. predicted sales on top, per-day error below.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
ax_series, ax_error = axes

# Top panel: both series plus a band of +/-10% around the prediction.
ax_series.plot(dates_test, y_test.values, label='Actual', linewidth=2)
ax_series.plot(dates_test, y_pred, label='Predicted', linewidth=2, alpha=0.8)
band_low, band_high = y_pred * 0.9, y_pred * 1.1
ax_series.fill_between(dates_test, band_low, band_high, alpha=0.2, label='±10% Band')
ax_series.set_xlabel('Date')
ax_series.set_ylabel('Sales ($)')
ax_series.set_title('Sales Prediction vs Actual')
ax_series.legend()
ax_series.grid(True, alpha=0.3)

# Bottom panel: signed error (actual - predicted) with a zero reference line.
errors = y_test.values - y_pred
ax_error.bar(dates_test, errors, alpha=0.7)
ax_error.axhline(y=0, color='r', linestyle='--')
ax_error.set_xlabel('Date')
ax_error.set_ylabel('Error ($)')
ax_error.set_title('Prediction Errors')
ax_error.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Step 7: 특성 중요도

# FLAML exposes the best trained model wrapper as `automl.model`; the fitted
# underlying learner is that wrapper's `.estimator` attribute. `automl.model`
# is None when no model was trained, hence the defensive getattr.
best_learner = getattr(automl.model, 'estimator', None)

# Not every learner reports importances, so the whole section is optional.
if best_learner is not None and hasattr(best_learner, 'feature_importances_'):
    importance = np.asarray(best_learner.feature_importances_)
    feature_names = X.columns

    # Indices of the 10 most important features, most important first.
    sorted_idx = np.argsort(importance)[::-1][:10]

    # barh draws the first entry at the bottom, so reverse the order to put
    # the most important feature at the top of the chart.
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(sorted_idx)), importance[sorted_idx][::-1])
    plt.yticks(range(len(sorted_idx)), feature_names[sorted_idx][::-1])
    plt.xlabel('Importance')
    plt.title('Top 10 Feature Importance for Sales Prediction')
    plt.tight_layout()
    plt.show()

    print("상위 5개 중요 특성:")
    for i in sorted_idx[:5]:
        print(f"  {feature_names[i]}: {importance[i]:.4f}")

Step 8: 비즈니스 인사이트

# Per-day results for the test period (error = actual - predicted).
df_result = pd.DataFrame({
    'date': dates_test.values,
    'actual': y_test.values,
    'predicted': y_pred,
    'error': y_test.values - y_pred,
    'abs_error': np.abs(y_test.values - y_pred),
    'dayofweek': [d.dayofweek for d in dates_test]
})

# Accuracy by day of week (0 = Monday): mean absolute error and mean sales.
dow_accuracy = df_result.groupby('dayofweek').agg({
    'abs_error': 'mean',
    'actual': 'mean'
})
# MAPE is the mean of the per-day absolute percentage errors, the same
# definition used for the overall test MAPE. Dividing the mean error by the
# mean sales would give a different, volume-weighted ratio.
abs_pct_error = df_result['abs_error'] / df_result['actual'] * 100
dow_accuracy['mape'] = abs_pct_error.groupby(df_result['dayofweek']).mean()

print("요일별 예측 성능:")
print(dow_accuracy.round(2))

# Accuracy with and without a promotion. A 60-day window may contain no
# promotion day at all, so reindex to guarantee both groups exist (NaN when
# a group is absent) instead of failing on a missing key.
df_result['promotion'] = X_test['promotion'].values
promo_accuracy = df_result.groupby('promotion')['abs_error'].mean().reindex([0, 1])
print(f"\n프로모션별 MAE:")
for flag, label in ((0, '프로모션 없음'), (1, '프로모션 있음')):
    group_mae = promo_accuracy.loc[flag]
    if pd.notna(group_mae):
        print(f"  {label}: ${group_mae:,.0f}")
    else:
        print(f"  {label}: N/A")

Step 9: 미래 예측 (다음 7일)

def _sales_history_feature(name, history):
    """Return the lag or moving-average value encoded in a feature name.

    ``sales_lag<k>`` maps to the sale ``k`` days before the day being
    forecast, and ``sales_ma<k>`` to the mean of the ``k`` most recent sales.
    Returns ``np.nan`` when ``history`` holds fewer than ``k`` values, and
    ``None`` when ``name`` is not a lag / moving-average feature.
    """
    for prefix, is_lag in (('sales_lag', True), ('sales_ma', False)):
        window = name[len(prefix):]
        if not (name.startswith(prefix) and window.isdecimal() and int(window) > 0):
            continue
        k = int(window)
        if len(history) < k:
            return np.nan
        return history[-k] if is_lag else float(np.mean(history[-k:]))
    return None


def forecast_next_days(model, last_data, n_days=7, feature_columns=None):
    """Recursively forecast sales for the ``n_days`` days after ``last_data``.

    A fresh feature row is built for each future date, and every prediction
    is appended to the sales history so that later lag / moving-average
    features are computed from it.

    Args:
        model: Fitted estimator exposing ``predict(DataFrame)``.
        last_data: Non-empty observed history ordered by date, oldest first.
            Must contain ``date`` and ``sales`` columns plus every feature
            that is not derived from the date or the sales history. It is
            not modified.
        n_days: Number of future days to forecast.
        feature_columns: Feature names in the order the model was trained
            on. Defaults to the module-level ``feature_cols``.

    Returns:
        List of ``n_days`` predicted sales values (floats), earliest first.
        Empty when ``n_days`` is not positive.

    Notes:
        * Calendar features (year, month, day, dayofweek, weekofyear,
          quarter, is_weekend, is_holiday) are recomputed from each forecast
          date, using the same weekend / Dec 20-31 holiday rules as the
          data-generation step.
        * ``sales_lag<k>`` / ``sales_ma<k>`` are recomputed from observed plus
          already-predicted sales, so errors compound over the horizon.
        * Any other feature (e.g. promotion, temperature) is carried forward
          from the last observed row. That is a simplifying assumption;
          supply real plans / forecasts for those inputs in production.
    """
    if n_days <= 0:
        return []
    if feature_columns is None:
        # Fall back to the feature list built when the data was split.
        feature_columns = feature_cols
    feature_columns = list(feature_columns)

    last_row = last_data.iloc[-1]
    history = last_data['sales'].astype(float).tolist()
    current_date = pd.Timestamp(last_row['date'])
    predictions = []

    for _ in range(n_days):
        current_date = current_date + pd.Timedelta(days=1)
        calendar = {
            'year': current_date.year,
            'month': current_date.month,
            'day': current_date.day,
            'dayofweek': current_date.dayofweek,
            'weekofyear': current_date.isocalendar()[1],
            'quarter': current_date.quarter,
            'is_weekend': int(current_date.dayofweek >= 5),
            'is_holiday': int(current_date.month == 12 and current_date.day >= 20),
        }

        row = {}
        for name in feature_columns:
            if name in calendar:
                row[name] = calendar[name]
                continue
            derived = _sales_history_feature(name, history)
            # Features that are neither calendar nor history based are
            # carried forward from the last observed row.
            row[name] = last_row[name] if derived is None else derived

        next_features = pd.DataFrame([row], columns=feature_columns)
        next_sales = float(np.asarray(model.predict(next_features)).ravel()[0])
        predictions.append(next_sales)
        # Feed the prediction back so the next day's lags can use it.
        history.append(next_sales)

    return predictions

# Placeholder output: a full forecast needs per-date features for the
# future days, so only a naive reference value is reported here.
print("\n다음 7일 예측 시뮬레이션:")
print("(실제 구현 시 날짜별 특성 생성 필요)")

# Naive baseline: mean actual sales over the last 7 days of the test period.
last_week_avg = y_test.iloc[-7:].mean()
print(f"참고: 마지막 주 평균 매출: ${last_week_avg:,.0f}")

Step 10: 모델 저장

import pickle

# Bundle the fitted AutoML object with the feature order it expects and the
# test metrics, so the file is self-describing when loaded later.
model_package = dict(
    model=automl,
    features=feature_cols,
    metrics=dict(rmse=rmse, mae=mae, r2=r2, mape=mape),
)

# NOTE: pickle files execute code on load; only unpickle files you trust.
with open('sales_forecast_model.pkl', 'wb') as out_file:
    pickle.dump(model_package, out_file)

print("모델 저장 완료: sales_forecast_model.pkl")

정리

매출 예측에서 시간 특성(월, 요일, 계절)이 중요합니다.
Lag 특성과 이동 평균이 예측력을 높입니다.
MAPE로 비율 기반 오차를 평가합니다.
프로모션, 휴일 등 외부 요인을 고려합니다.
시계열 데이터는 시간 기반 분할을 사용합니다.
비즈니스 인사이트를 위해 요일별, 이벤트별 분석을 수행합니다.

다음 글 예고

다음 글에서는 회귀에서의 앙상블 기법에 대해 알아보겠습니다. 여러 모델을 결합하여 예측 성능을 높이는 방법을 다룹니다.

FLAML AutoML 마스터 시리즈 #044

개요​

실습 환경​

프로젝트 개요​

목표​

비즈니스 가치​

Step 1: 데이터 준비​

Step 2: 탐색적 데이터 분석​

매출 추이​

상관관계​

Step 3: 특성 엔지니어링​

Step 4: 데이터 분할​

Step 5: FLAML AutoML 학습​

Step 6: 모델 평가​

예측 시각화​

Step 7: 특성 중요도​

Step 8: 비즈니스 인사이트​

Step 9: 미래 예측 (다음 7일)​

Step 10: 모델 저장​

정리​

다음 글 예고​

개요