053 회귀 실전 - 매출 예측

키워드: 매출, 예측

개요

매출 예측은 비즈니스에서 가장 중요한 예측 과제 중 하나입니다. 이 글에서는 PyCaret을 활용하여 소매업 매출을 예측하는 실전 프로젝트를 진행합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

비즈니스 문제 정의

목표: 매장의 주간 매출 예측
활용: 재고 관리, 인력 배치, 마케팅 예산 수립

데이터 준비

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Simulate weekly retail sales data. The global NumPy seed is fixed so the
# generated values are reproducible; the random draws below must stay in this
# order for the output to remain the same.
np.random.seed(42)
n_samples = 1000

# One date per row, cycling through 52 weekly offsets from the start date.
start_date = datetime(2022, 1, 1)
week_offsets = [i % 52 for i in range(n_samples)]
dates = [start_date + timedelta(weeks=offset) for offset in week_offsets]

# Feature columns, generated one at a time (insertion order == column order).
features = {}
features['store_id'] = np.random.choice([1, 2, 3, 4, 5], n_samples)
features['week_of_year'] = [d.isocalendar()[1] for d in dates]
features['month'] = [d.month for d in dates]
features['is_holiday'] = np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
features['temperature'] = np.clip(np.random.normal(15, 10, n_samples), -10, 40)
features['fuel_price'] = np.random.uniform(2.5, 4.5, n_samples)
features['unemployment'] = np.random.uniform(4, 10, n_samples)
features['cpi'] = np.random.uniform(200, 250, n_samples)
features['promotion'] = np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
features['competitor_distance'] = np.random.uniform(0.5, 20, n_samples)
data = pd.DataFrame(features)

# Target (weekly sales): additive effects on top of a base level, floored at 0.
base_sales = 50000
store_effect = data['store_id'] * 5000          # per-store baseline
holiday_effect_sim = data['is_holiday'] * 15000  # holiday uplift
promo_effect_sim = data['promotion'] * 8000      # promotion uplift
unemployment_drag = data['unemployment'] * 1000  # negative effect of unemployment
seasonality = np.sin(data['week_of_year'] / 52 * 2 * np.pi) * 10000  # yearly cycle
noise = np.random.normal(0, 5000, n_samples)     # random noise
raw_sales = (
    base_sales
    + store_effect
    + holiday_effect_sim
    + promo_effect_sim
    - unemployment_drag
    + seasonality
    + noise
)
data['weekly_sales'] = raw_sales.clip(lower=0)

print(f"데이터 크기: {len(data)}")
print(f"매출 범위: ${data['weekly_sales'].min():,.0f} ~ ${data['weekly_sales'].max():,.0f}")
print(f"평균 매출: ${data['weekly_sales'].mean():,.0f}")

탐색적 데이터 분석

import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory plots on a 2x2 grid: the overall sales distribution plus weekly
# sales broken down by store, holiday flag and promotion flag.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Histogram of all weekly sales values.
axes[0, 0].hist(data['weekly_sales'], bins=30, edgecolor='black')
axes[0, 0].set_title('Weekly Sales Distribution')
axes[0, 0].set_xlabel('Sales ($)')

# Box plots of weekly sales grouped by a single column each:
# (grouping column, target axes, panel title, x-axis label).
grouped_panels = [
    ('store_id', axes[0, 1], 'Sales by Store', 'Store ID'),
    ('is_holiday', axes[1, 0], 'Sales: Holiday vs Non-Holiday', 'Is Holiday'),
    ('promotion', axes[1, 1], 'Sales: Promotion vs No Promotion', 'Promotion'),
]
for group_col, panel_ax, panel_title, panel_xlabel in grouped_panels:
    data.boxplot(column='weekly_sales', by=group_col, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)

# DataFrame.boxplot(by=...) sets a figure-level "Boxplot grouped by <column>"
# title on every call, so without this the whole 2x2 figure would be labelled
# with the last grouping column only ("promotion"). Clear it.
fig.suptitle('')

plt.tight_layout()
plt.savefig('sales_eda.png', dpi=150)

PyCaret 설정

from pycaret.regression import *

# Cast store IDs to strings so they are treated as a categorical feature
# rather than as a numeric quantity.
data['store_id'] = data['store_id'].astype(str)

# Columns passed explicitly as numeric. 'is_holiday' and 'promotion' appear in
# neither list, so their handling is left to setup().
numeric_columns = [
    'week_of_year',
    'month',
    'temperature',
    'fuel_price',
    'unemployment',
    'cpi',
    'competitor_distance',
]

# Initialise the PyCaret regression experiment (fixed session_id for
# reproducibility, console output suppressed).
setup_options = dict(
    data=data,
    target='weekly_sales',
    categorical_features=['store_id'],
    numeric_features=numeric_columns,
    session_id=42,
    verbose=False,
)
reg = setup(**setup_options)

print("설정 완료!")

모델 비교

# Compare the candidate regressors and keep the top 5 (n_select=5).
best_models = compare_models(n_select=5)

최적 모델 선택 및 튜닝

# Train a LightGBM regressor (generally effective for sales forecasting) and
# capture the score grid produced by that call.
lgbm = create_model('lightgbm', verbose=False)
baseline_scores = pull()
print("LightGBM 기본 성능:")
print(baseline_scores)

# Tune the hyperparameters, optimising for RMSE, and capture the new score grid.
tuned_lgbm = tune_model(lgbm, optimize='RMSE')
tuned_scores = pull()
print("\n튜닝 후 성능:")
print(tuned_scores)

특성 중요도 분석

import pandas as pd

# Feature importances of the tuned LightGBM model.
#
# The estimator is fitted on the output of PyCaret's preprocessing pipeline
# (for example, the string 'store_id' column is encoded before training), so
# its importances line up with the *transformed* training columns. Pairing
# them with the raw X_train columns can mismatch in length and order, which
# makes the DataFrame construction below fail or mislabel the features.
feature_names = get_config('X_train_transformed').columns
importances = tuned_lgbm.feature_importances_

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("특성 중요도:")
print(importance_df)

# Built-in PyCaret feature-importance plot for the same model.
plot_model(tuned_lgbm, plot='feature')

모델 해석 (SHAP)

import shap

# Compute SHAP values on the hold-out set.
#
# The tree model was trained on PyCaret's transformed feature matrix, so the
# explainer has to be given the transformed hold-out frame. The raw X_test
# still contains the string 'store_id' column and the pre-encoding column
# layout, which does not match what the model expects.
X_test = get_config('X_test_transformed')
explainer = shap.TreeExplainer(tuned_lgbm)
shap_values = explainer.shap_values(X_test)

# Summary plot: per-feature distribution of SHAP values.
shap.summary_plot(shap_values, X_test)

# Insights the article expects from this plot (they mirror the effects built
# into the simulated data; confirm against the actual plot output):
# 1. Holidays (is_holiday) have the largest impact on sales
# 2. Promotions (promotion) are also an important factor
# 3. Unemployment (unemployment) has a negative effect

잔차 분석

# Residual plot for the tuned LightGBM model
plot_model(tuned_lgbm, plot='residuals')

# Prediction error plot for the tuned LightGBM model
plot_model(tuned_lgbm, plot='error')

앙상블 모델

# Train additional base learners to combine with the tuned LightGBM model.
xgb = create_model('xgboost', verbose=False)
rf = create_model('rf', verbose=False)
# NOTE: gbr is trained here but is not included in either ensemble below.
gbr = create_model('gbr', verbose=False)

# Members shared by both ensembles.
ensemble_members = (tuned_lgbm, xgb, rf)

# Blending: combine the members' predictions.
blended = blend_models(list(ensemble_members))
blend_scores = pull()
print("블렌딩 모델 성능:")
print(blend_scores)

# Stacking: combine the same members through a stacked ensemble.
stacked = stack_models(list(ensemble_members))
stack_scores = pull()
print("\n스태킹 모델 성능:")
print(stack_scores)

최종 모델 및 예측

# Final model: refit the tuned LightGBM on the full dataset.
final_model = finalize_model(tuned_lgbm)

# Three example scenarios to score. Week, month, temperature, fuel price,
# unemployment and CPI are held constant; store, holiday flag, promotion flag
# and competitor distance vary per row.
n_scenarios = 3
scenario_columns = {
    'store_id': ['1', '2', '3'],
    'week_of_year': [25] * n_scenarios,
    'month': [6] * n_scenarios,
    'is_holiday': [0, 0, 1],
    'temperature': [25] * n_scenarios,
    'fuel_price': [3.5] * n_scenarios,
    'unemployment': [5.5] * n_scenarios,
    'cpi': [230] * n_scenarios,
    'promotion': [0, 1, 1],
    'competitor_distance': [5, 10, 3],
}
new_data = pd.DataFrame(scenario_columns)

# Score the scenarios and show the columns that differ alongside the prediction.
predictions = predict_model(final_model, data=new_data)
report_columns = ['store_id', 'is_holiday', 'promotion', 'prediction_label']
print("매출 예측:")
print(predictions[report_columns])

예측 구간 추정

import numpy as np

# 053 부트스트랩 예측 구간
def bootstrap_prediction_interval(model, X, n_bootstrap=100, alpha=0.05,
                                  residuals=None):
    """Estimate per-row prediction intervals with a residual bootstrap.

    Every row of ``X`` is scored once. Then ``n_bootstrap`` residuals are
    drawn with replacement for each row and added to that row's point
    prediction; the interval bounds are percentiles of those simulated values.

    The previous implementation resampled the rows of ``X`` itself and stacked
    the predictions by position, so the values collected for row ``i`` were
    predictions of randomly chosen other rows. The resulting "interval" only
    described the spread of predictions across ``X``, not the uncertainty of
    row ``i``.

    Args:
        model: Fitted estimator/pipeline accepted by ``predict_model``.
        X: DataFrame of rows to predict.
        n_bootstrap: Number of residual draws per row (must be >= 1).
        alpha: Significance level; the interval covers ``1 - alpha``
            (must satisfy ``0 < alpha < 1``).
        residuals: Optional 1-D array-like of ``actual - predicted`` errors.
            When omitted, residuals are computed by scoring the active
            experiment's hold-out split (``get_config('X_test')`` /
            ``get_config('y_test')``) with ``model``. If ``model`` has been
            refit on all data, those residuals are in-sample and the interval
            will be optimistic; pass residuals from a model that did not see
            the hold-out rows for honest intervals.

    Returns:
        Tuple ``(point_pred, lower, upper)`` of 1-D arrays with one entry per
        row of ``X``: the model's point prediction and the lower/upper
        interval bounds.

    Raises:
        ValueError: If ``n_bootstrap`` < 1, ``alpha`` is outside (0, 1), or
            no residuals are available.
    """
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be between 0 and 1 (exclusive)")

    # Point prediction for each requested row, in the order of X.
    scored = predict_model(model, data=X, verbose=False)
    point_pred = scored['prediction_label'].to_numpy(dtype=float)

    if residuals is None:
        # Fall back to hold-out errors from the active PyCaret experiment.
        # Scoring X_test explicitly keeps predictions in the same row order
        # as y_test, so the subtraction is done positionally.
        holdout_X = get_config('X_test')
        holdout_y = get_config('y_test')
        holdout_scored = predict_model(model, data=holdout_X, verbose=False)
        residuals = (
            holdout_y.to_numpy(dtype=float)
            - holdout_scored['prediction_label'].to_numpy(dtype=float)
        )

    residuals = np.asarray(residuals, dtype=float).ravel()
    if residuals.size == 0:
        raise ValueError("residuals must contain at least one value")

    # Shape (n_bootstrap, n_rows): each column holds simulated outcomes for
    # one row of X (its own point prediction plus resampled errors).
    sampled_errors = np.random.choice(
        residuals, size=(n_bootstrap, len(point_pred)), replace=True
    )
    simulated = point_pred + sampled_errors

    lower = np.percentile(simulated, alpha / 2 * 100, axis=0)
    upper = np.percentile(simulated, (1 - alpha / 2) * 100, axis=0)

    return point_pred, lower, upper

# Compute the interval for each example scenario and print one line per store.
mean_pred, lower, upper = bootstrap_prediction_interval(final_model, new_data)

print("예측 (95% 신뢰구간):")
interval_rows = zip(new_data['store_id'], mean_pred, lower, upper)
for store_label, center, low, high in interval_rows:
    print(f"  매장 {store_label}: ${center:,.0f} [${low:,.0f}, ${high:,.0f}]")

비즈니스 인사이트

# Summarise the headline business insights from the raw data.
print("=== 매출 예측 비즈니스 인사이트 ===\n")

weekly = data['weekly_sales']

# 1. Holiday effect: mean sales in holiday weeks minus non-holiday weeks.
holiday_mean = weekly[data['is_holiday'] == 1].mean()
regular_mean = weekly[data['is_holiday'] == 0].mean()
holiday_effect = holiday_mean - regular_mean
print(f"1. 휴일 효과: +${holiday_effect:,.0f}/주")

# 2. Promotion effect: mean sales with a promotion minus without.
promo_mean = weekly[data['promotion'] == 1].mean()
no_promo_mean = weekly[data['promotion'] == 0].mean()
promo_effect = promo_mean - no_promo_mean
print(f"2. 프로모션 효과: +${promo_effect:,.0f}/주")

# 3. Average sales per store, highest first.
print("\n3. 매장별 평균 매출:")
store_means = data.groupby('store_id')['weekly_sales'].mean()
store_sales = store_means.sort_values(ascending=False)
for store, sales in store_sales.items():
    print(f"   매장 {store}: ${sales:,.0f}")

# 4. Seasonality: best and worst month by average sales.
print("\n4. 월별 평균 매출:")
monthly_sales = data.groupby('month')['weekly_sales'].mean()
best_month = monthly_sales.idxmax()
worst_month = monthly_sales.idxmin()
best_value = monthly_sales[best_month]
worst_value = monthly_sales[worst_month]
print(f"   최고: {best_month}월 (${best_value:,.0f})")
print(f"   최저: {worst_month}월 (${worst_value:,.0f})")

모델 저장

# Save the finalized model under the name 'sales_prediction_model'
save_model(final_model, 'sales_prediction_model')
print("모델 저장 완료: sales_prediction_model.pkl")

# Load the saved model back using the same name
loaded_model = load_model('sales_prediction_model')

# Smoke test: score the example rows with the reloaded model
test_pred = predict_model(loaded_model, data=new_data)
print("\n로드된 모델 예측 확인 완료")

실무 적용 팁

데이터 품질: 매출 데이터의 정확성 확인 (반품, 취소 처리)
외부 변수: 날씨, 경제 지표, 경쟁사 활동 반영
계절성: 연간 패턴 충분히 포함 (최소 2년 데이터 권장)
업데이트: 주기적 재학습 (주간/월간)
모니터링: 예측 vs 실제 비교로 모델 성능 추적

정리

매출 예측은 회귀 문제의 대표적 비즈니스 활용 사례
휴일, 프로모션, 계절성이 주요 영향 요인
LightGBM/XGBoost가 효과적
SHAP으로 예측 근거 설명 가능
예측 구간으로 불확실성 표현

다음 글 예고

다음 글에서는 회귀 실전 - 수요 예측을 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #053

개요

실습 환경

비즈니스 문제 정의

데이터 준비

탐색적 데이터 분석

PyCaret 설정

모델 비교

최적 모델 선택 및 튜닝

특성 중요도 분석

모델 해석 (SHAP)

잔차 분석

앙상블 모델

최종 모델 및 예측

예측 구간 추정

비즈니스 인사이트

모델 저장

실무 적용 팁

정리

다음 글 예고

개요

실습 환경

비즈니스 문제 정의

데이터 준비

탐색적 데이터 분석

PyCaret 설정

모델 비교

최적 모델 선택 및 튜닝

특성 중요도 분석

모델 해석 (SHAP)

잔차 분석

앙상블 모델

최종 모델 및 예측

예측 구간 추정

비즈니스 인사이트

모델 저장

실무 적용 팁

정리

다음 글 예고