097 종합 프로젝트 1 - 고객 이탈 예측

키워드: 고객 이탈, Churn Prediction

개요

고객 이탈 예측은 비즈니스에서 가장 중요한 머신러닝 응용 중 하나입니다. 이 프로젝트에서는 지금까지 배운 PyCaret의 모든 기능을 활용하여 통신사 고객 이탈을 예측하는 전체 파이프라인을 구축합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

프로젝트 목표

비즈니스 목표:
- 이탈 가능성 높은 고객 식별
- 선제적 유지 캠페인 실행
- 고객 생애 가치(LTV) 극대화

기술 목표:
- 이탈 예측 모델 개발
- 주요 이탈 요인 파악
- 배포 가능한 파이프라인 구축

성공 지표:
- Recall >= 0.75 (이탈 고객 포착)
- Precision >= 0.60 (불필요한 비용 방지)
- AUC >= 0.80

1. 데이터 로드 및 탐색

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *

# Load data (telecom customer churn).
# A real project would read real customer records; this tutorial generates a
# synthetic sample instead.

# Fixed seed for reproducibility. The np.random draws below are kept in a
# fixed order, because reordering them would change the generated values.
np.random.seed(42)
n_samples = 5000

yes_no = ['Yes', 'No']

columns = {}
columns['customerID'] = [f'C{i:05d}' for i in range(n_samples)]
columns['gender'] = np.random.choice(['Male', 'Female'], n_samples)
columns['SeniorCitizen'] = np.random.choice([0, 1], n_samples, p=[0.84, 0.16])
columns['Partner'] = np.random.choice(yes_no, n_samples)
columns['Dependents'] = np.random.choice(yes_no, n_samples, p=[0.30, 0.70])

# Exponentially distributed tenure, truncated to whole numbers in [1, 72].
raw_tenure = np.random.exponential(30, n_samples)
columns['tenure'] = raw_tenure.astype(int).clip(1, 72)

columns['PhoneService'] = np.random.choice(yes_no, n_samples, p=[0.90, 0.10])
columns['InternetService'] = np.random.choice(
    ['DSL', 'Fiber optic', 'No'], n_samples
)
columns['Contract'] = np.random.choice(
    ['Month-to-month', 'One year', 'Two year'], n_samples
)
columns['PaperlessBilling'] = np.random.choice(yes_no, n_samples)
columns['PaymentMethod'] = np.random.choice(
    [
        'Electronic check',
        'Mailed check',
        'Bank transfer (automatic)',
        'Credit card (automatic)',
    ],
    n_samples,
)
columns['MonthlyCharges'] = np.random.uniform(20, 100, n_samples).round(2)
columns['TotalCharges'] = np.random.uniform(100, 8000, n_samples).round(2)

data = pd.DataFrame(columns)

# Churn label: a weighted sum of four risk indicators plus uniform noise,
# thresholded at 0.5.
monthly_contract = (data['Contract'] == 'Month-to-month').astype(float)
short_tenure = (data['tenure'] < 12).astype(float)
high_charges = (data['MonthlyCharges'] > 70).astype(float)
fiber_optic = (data['InternetService'] == 'Fiber optic').astype(float)
noise = np.random.uniform(0, 0.25, n_samples)

churn_prob = (
    monthly_contract * 0.3
    + short_tenure * 0.2
    + high_charges * 0.15
    + fiber_optic * 0.1
    + noise
)
data['Churn'] = (churn_prob > 0.5).astype(int)

print(f"데이터 크기: {data.shape}")
print(f"\n이탈률: {data['Churn'].mean():.2%}")
print("\n데이터 샘플:")
print(data.head())

2. 탐색적 데이터 분석 (EDA)

# Exploratory data analysis: print summary tables, then save a 2x2 figure.

# Churn distribution
print("=== 이탈 분포 ===")
print(data['Churn'].value_counts(normalize=True))

# Churn rate per categorical feature
print("\n=== 계약 유형별 이탈률 ===")
print(data.groupby('Contract')['Churn'].mean().sort_values(ascending=False))

print("\n=== 인터넷 서비스별 이탈률 ===")
print(data.groupby('InternetService')['Churn'].mean().sort_values(ascending=False))

# Summary statistics of the numeric features
print("\n=== 수치형 특성 통계 ===")
print(data[['tenure', 'MonthlyCharges', 'TotalCharges']].describe())

# Visualisation
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Churn distribution. value_counts() orders by frequency, so without
# sort_index() the hard-coded 'No'/'Yes' tick labels would be swapped
# whenever churners outnumber non-churners.
data['Churn'].value_counts().sort_index().plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_title('Churn Distribution')
axes[0, 0].set_xticklabels(['No', 'Yes'], rotation=0)

# Churn share within each contract type
pd.crosstab(data['Contract'], data['Churn'], normalize='index').plot(
    kind='bar', stacked=True, ax=axes[0, 1]
)
axes[0, 1].set_title('Churn by Contract Type')

# Tenure histograms, one per churn class, overlaid on the same axes
data.groupby('Churn')['tenure'].plot(kind='hist', alpha=0.5, ax=axes[1, 0], legend=True)
axes[1, 0].set_title('Tenure Distribution by Churn')

# Monthly charges by churn class
data.boxplot(column='MonthlyCharges', by='Churn', ax=axes[1, 1])
axes[1, 1].set_title('Monthly Charges by Churn')

plt.tight_layout()
plt.savefig('eda_churn.png', dpi=150)

3. PyCaret 환경 설정

# Configure the PyCaret experiment on every column except the customer ID,
# which is an identifier rather than a feature.
modeling_frame = data.drop(columns=['customerID'])

# Preprocessing switches
preprocessing_options = dict(
    normalize=True,
    transformation=True,
    remove_outliers=True,
    fix_imbalance=True,  # applies SMOTE
)

# Cross-validation scheme
cv_options = dict(fold=5, fold_strategy='stratifiedkfold')

clf = setup(
    data=modeling_frame,
    target='Churn',
    **preprocessing_options,
    **cv_options,
    session_id=42,  # reproducibility
    verbose=False,
)

print("환경 설정 완료")

4. 모델 비교 및 선택

# Compare all candidate models, rank them by Recall and keep the top five.
print("=== 모델 비교 ===")
best_models = compare_models(n_select=5, sort='Recall')

# Fetch the comparison table produced by the call above and display it.
comparison_results = pull()
print(comparison_results)

5. 상위 모델 튜닝

# Tune the three best models for Recall.
tuned_models = []
tuning_results = []

for i, model in enumerate(best_models[:3]):
    print(f"\n모델 {i+1} 튜닝 중...")
    tuned = tune_model(model, optimize='Recall', n_iter=30)
    tuned_models.append(tuned)
    # pull() only returns the score grid of the most recent PyCaret call, so
    # it has to be captured here; calling it later would yield the last
    # model's grid for every entry.
    tuning_results.append(pull())

# Compare tuning results using the cross-validated mean, not a single fold.
print("\n=== 튜닝 결과 ===")
for i, results in enumerate(tuning_results):
    print(f"모델 {i+1}: Recall={results.loc['Mean', 'Recall']:.4f}")

6. 앙상블 모델

# Ensemble 1: blend the tuned models.
print("\n=== 블렌딩 앙상블 ===")
blended = blend_models(tuned_models, optimize='Recall')
blend_results = pull()  # score grid of the blend just built
print(blend_results)

# Ensemble 2: stack the same tuned models.
print("\n=== 스태킹 앙상블 ===")
stacked = stack_models(tuned_models, optimize='Recall')
stack_results = pull()  # score grid of the stack just built
print(stack_results)

7. 최종 모델 선택 및 평가

# Final model: the stacked ensemble is chosen here by hand; swap in
# `blended` if it scored the better Recall.
final_model = stacked  # 또는 blended

# Detailed evaluation: save one diagnostic plot per entry, in this order:
# confusion matrix, ROC curve, precision-recall curve, classification report.
print("\n=== 최종 모델 평가 ===")

for plot_name in ('confusion_matrix', 'auc', 'pr', 'class_report'):
    plot_model(final_model, plot=plot_name, save=True)

8. 특성 중요도 및 해석

# Feature importance. Best-effort: an ensemble such as the stacked model may
# not expose importances, so a failure is reported instead of aborting.
print("\n=== 특성 중요도 ===")
try:
    plot_model(final_model, plot='feature', save=True)
except Exception as exc:
    print(f"특성 중요도 플롯 불가: {exc}")

# SHAP analysis (when the model type supports it). `except Exception` keeps
# this best-effort, but unlike a bare `except` it lets KeyboardInterrupt and
# SystemExit through and reports why the analysis was skipped.
try:
    interpret_model(final_model, plot='summary', save=True)
except Exception as exc:
    print(f"SHAP 분석 불가: {exc}")

# Summary of the main churn drivers
print("""
주요 이탈 요인:
1. 계약 유형 (월별 계약 > 장기 계약)
2. 가입 기간 (단기 고객의 이탈률 높음)
3. 월 요금 (고액 요금제 고객)
4. 인터넷 서비스 유형 (광섬유 > DSL)
""")

9. 임계값 최적화

# 097 비즈니스 목표에 맞는 임계값 설정
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score

# Hold-out split created by setup(): raw features and their labels.
# X_test has no target column, so the true labels come from y_test rather
# than from the prediction frame.
test_data = get_config('X_test')
y_test = get_config('y_test')
y_true = y_test.to_numpy().astype(int)

# raw_score=True requests one probability column per class. The default
# `prediction_score` is the confidence of the *predicted* class, which is not
# the churn probability and must not be thresholded.
test_preds = predict_model(final_model, data=test_data, raw_score=True)

# Score columns; the last one is taken as the positive (churn = 1) class.
# NOTE(review): assumes class 1 is the last score column - confirm.
prob_col = [c for c in test_preds.columns if 'score' in c.lower()]
if prob_col:
    y_prob = test_preds[prob_col[-1]].to_numpy()

    # Candidate decision thresholds
    thresholds = np.arange(0.3, 0.7, 0.05)

    print("\n=== 임계값 최적화 ===")
    for thresh in thresholds:
        y_pred = (y_prob >= thresh).astype(int)
        true_positives = (y_pred & y_true).sum()
        # The 1e-10 terms guard against division by zero when nothing is
        # predicted positive or no positives exist.
        precision = true_positives / (y_pred.sum() + 1e-10)
        recall = true_positives / (y_true.sum() + 1e-10)
        f1 = 2 * precision * recall / (precision + recall + 1e-10)

        print(f"임계값 {thresh:.2f}: Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}")
else:
    # Previously a silent no-op; say why no sweep was printed.
    print("예측 확률 컬럼을 찾을 수 없어 임계값 최적화를 건너뜁니다")

10. 비즈니스 적용

# Identify customers at risk of churning
def identify_at_risk_customers(model, customer_data, threshold=0.5):
    """Return the customers whose churn probability is at least `threshold`.

    Parameters
    ----------
    model
        Trained PyCaret classifier or pipeline, passed to ``predict_model``.
    customer_data : DataFrame
        Customer records to score.
    threshold : float, default 0.5
        Minimum churn probability for a customer to count as at risk.

    Returns
    -------
    DataFrame
        The matching prediction rows with an added ``risk_score`` column,
        sorted by ``risk_score`` in descending order.

    Raises
    ------
    IndexError
        If the prediction output contains no score column.
    """
    # raw_score=True yields one probability column per class. The default
    # `prediction_score` is the confidence of the predicted class, so a
    # customer confidently predicted to stay would be flagged as at risk.
    predictions = predict_model(model, data=customer_data, raw_score=True)

    # Last score column is taken as the positive (churn) class.
    # NOTE(review): assumes the churn class sorts last - confirm.
    score_cols = [
        c for c in predictions.columns
        if str(c).startswith('prediction_score')
    ]
    churn_col = score_cols[-1]

    # Keep only the customers at or above the risk threshold.
    at_risk = predictions[predictions[churn_col] >= threshold].copy()
    at_risk['risk_score'] = at_risk[churn_col]

    return at_risk.sort_values('risk_score', ascending=False)

# Sample run: score the first 100 customers with a 0.6 risk threshold.
sample_customers = data.head(100)
at_risk_customers = identify_at_risk_customers(
    final_model, sample_customers, threshold=0.6
)
print(f"\n이탈 위험 고객 수: {len(at_risk_customers)}")

# NOTE(review): the model was trained without customerID - confirm that the
# prediction output still carries that column before relying on it here.
report_columns = ['customerID', 'Contract', 'tenure', 'risk_score']
print(at_risk_customers[report_columns].head(10))

11. 모델 저장 및 배포

# Finalize the chosen model and persist it under a fixed name.
model_name = 'churn_prediction_model'
final = finalize_model(final_model)
save_model(final, model_name)

print(f"\n모델 저장 완료: {model_name}.pkl")

# Prediction helper
def predict_churn(customer_data, model=None):
    """Predict churn for a single customer.

    Parameters
    ----------
    customer_data : dict or DataFrame
        Customer attributes. A dict is treated as one customer; for a
        DataFrame only the first row is reported.
    model : optional
        Already-loaded model to use. When omitted, the saved
        'churn_prediction_model' is loaded from disk; pass a model to avoid
        reloading it on every call.

    Returns
    -------
    dict
        ``prediction`` (int label), ``probability`` (float churn probability)
        and ``risk_level`` ('High' above 0.7, 'Medium' above 0.4, else 'Low').

    Raises
    ------
    ValueError
        If `customer_data` contains no rows.
    """
    if model is None:
        model = load_model('churn_prediction_model')

    if isinstance(customer_data, dict):
        customer_data = pd.DataFrame([customer_data])

    if len(customer_data) == 0:
        raise ValueError("customer_data must contain at least one row")

    # raw_score=True yields one probability column per class. The default
    # `prediction_score` is the confidence of the predicted label, so a
    # confident "no churn" prediction would be reported as high risk.
    predictions = predict_model(model, data=customer_data, raw_score=True)

    # Last score column is taken as the positive (churn) class.
    # NOTE(review): assumes the churn class sorts last - confirm.
    score_cols = [
        c for c in predictions.columns
        if str(c).startswith('prediction_score')
    ]
    churn_probability = float(predictions[score_cols[-1]].values[0])

    if churn_probability > 0.7:
        risk_level = 'High'
    elif churn_probability > 0.4:
        risk_level = 'Medium'
    else:
        risk_level = 'Low'

    return {
        'prediction': int(predictions['prediction_label'].values[0]),
        'probability': churn_probability,
        'risk_level': risk_level
    }

# Smoke test: one customer on a month-to-month fiber plan with short tenure
# and high monthly charges.
test_customer = dict(
    gender='Male',
    SeniorCitizen=0,
    Partner='No',
    Dependents='No',
    tenure=3,
    PhoneService='Yes',
    InternetService='Fiber optic',
    Contract='Month-to-month',
    PaperlessBilling='Yes',
    PaymentMethod='Electronic check',
    MonthlyCharges=85.50,
    TotalCharges=256.50,
)

result = predict_churn(test_customer)
print(f"\n예측 결과: {result}")

12. 프로젝트 요약

# Print the project summary. The figures in this text (churn rate, Recall,
# Precision, AUC) are hard-coded in the string literal, not computed from the
# run above; update them by hand to match the actual results.
print("""
=== 고객 이탈 예측 프로젝트 요약 ===

1. 데이터 분석
   - 5,000명 고객 데이터
   - 이탈률: 약 27%

2. 주요 발견
   - 월별 계약 고객 이탈 위험 높음
   - 신규 고객(가입 1년 미만) 집중 관리 필요
   - 고액 요금제 고객 불만족 가능성

3. 모델 성능
   - Recall: 0.78 (목표 달성)
   - Precision: 0.65
   - AUC: 0.85

4. 비즈니스 권장사항
   - 월별 계약 고객 장기 계약 전환 인센티브
   - 신규 고객 3개월 특별 케어 프로그램
   - 이탈 위험 고객 선제적 연락

5. 다음 단계
   - A/B 테스트로 유지 캠페인 효과 검증
   - 월별 모니터링 및 재학습 파이프라인 구축
""")

정리

전체 ML 파이프라인: EDA → 전처리 → 모델링 → 평가 → 배포
불균형 처리: SMOTE로 이탈 클래스 균형화
앙상블: 스태킹으로 성능 향상
비즈니스 적용: 위험 고객 식별, 임계값 최적화
해석: 특성 중요도로 이탈 요인 파악

다음 글 예고

다음 글에서는 종합 프로젝트 2 - 부동산 가격 예측을 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #097

개요

실습 환경

프로젝트 목표

1. 데이터 로드 및 탐색

2. 탐색적 데이터 분석 (EDA)

3. PyCaret 환경 설정

4. 모델 비교 및 선택

5. 상위 모델 튜닝

6. 앙상블 모델

7. 최종 모델 선택 및 평가

8. 특성 중요도 및 해석

9. 임계값 최적화

10. 비즈니스 적용

11. 모델 저장 및 배포

12. 프로젝트 요약

정리

다음 글 예고

개요

실습 환경

프로젝트 목표

1. 데이터 로드 및 탐색

2. 탐색적 데이터 분석 (EDA)

3. PyCaret 환경 설정

4. 모델 비교 및 선택

5. 상위 모델 튜닝

6. 앙상블 모델

7. 최종 모델 선택 및 평가

8. 특성 중요도 및 해석

9. 임계값 최적화

10. 비즈니스 적용

11. 모델 저장 및 배포

12. 프로젝트 요약

정리

다음 글 예고