029 분류 프로젝트 - 고객 이탈 예측

키워드: 고객 이탈, Churn, 마케팅

개요

고객 이탈(Customer Churn) 예측은 기업에서 매우 중요한 문제입니다. 이탈 가능성이 높은 고객을 사전에 파악하면 맞춤형 마케팅으로 이탈을 방지할 수 있습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, scikit-learn, matplotlib, seaborn

pip install "flaml[automl]" pandas scikit-learn matplotlib seaborn

프로젝트 개요

비즈니스 배경

신규 고객 획득 비용 > 기존 고객 유지 비용 (약 5~7배)
고객 유지율을 5%p만 높여도 이익이 25~95% 증가 가능
이탈 예측으로 선제적 마케팅 가능

목표

고객의 행동 패턴과 특성으로 이탈 여부 예측

Step 1: 데이터 준비

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Build a synthetic telecom customer dataset.
# The global NumPy seed is fixed so every run draws the same customers; the
# draws below must stay in this order for the data to remain reproducible.
np.random.seed(42)
n_samples = 5000

# One entry per column; every random column is drawn independently of the others.
data = {
    'customer_id': range(1, n_samples + 1),
    'tenure': np.random.randint(1, 72, n_samples),  # tenure in months (1..71, upper bound exclusive)
    'monthly_charges': np.random.uniform(20, 100, n_samples),
    'total_charges': np.random.uniform(100, 5000, n_samples),
    # `p` gives the sampling probability of each category, in list order.
    'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples, p=[0.5, 0.3, 0.2]),
    'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples, p=[0.4, 0.4, 0.2]),
    # Without `p`, categories are sampled uniformly.
    'online_security': np.random.choice(['Yes', 'No', 'No internet'], n_samples),
    'tech_support': np.random.choice(['Yes', 'No', 'No internet'], n_samples),
    'paperless_billing': np.random.choice(['Yes', 'No'], n_samples),
    'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_samples),
    'senior_citizen': np.random.choice([0, 1], n_samples, p=[0.84, 0.16]),
    'partner': np.random.choice(['Yes', 'No'], n_samples),
    'dependents': np.random.choice(['Yes', 'No'], n_samples),
}

df = pd.DataFrame(data)

# 029 이탈 여부 생성 (실제 패턴 반영)
def generate_churn(row):
    """Draw a synthetic churn label for one customer.

    Starts from a 15% base churn probability and shifts it according to the
    customer's contract type, tenure, monthly charges and tech-support
    status, then samples the label with one call to ``np.random.random()``.

    Args:
        row: Mapping-like record providing ``'contract'``, ``'tenure'``,
            ``'monthly_charges'`` and ``'tech_support'``.

    Returns:
        1 if the customer is sampled as churned, otherwise 0.
    """
    # Month-to-month contracts raise the risk, two-year contracts lower it;
    # any other contract leaves the base rate unchanged.
    contract_shift = {'Month-to-month': 0.25, 'Two year': -0.10}

    churn_prob = 0.15
    churn_prob += contract_shift.get(row['contract'], 0.0)

    # New customers (< 12 months) churn more, long-standing ones (> 48) less.
    months = row['tenure']
    churn_prob += 0.15 if months < 12 else (-0.10 if months > 48 else 0.0)

    # Expensive plans and missing tech support both add risk.
    churn_prob += 0.10 if row['monthly_charges'] > 70 else 0.0
    churn_prob += 0.05 if row['tech_support'] == 'No' else 0.0

    return int(np.random.random() < churn_prob)

# Sample one churn label per customer row.
df['churn'] = df.apply(generate_churn, axis=1)

# Boolean masks make the count (sum) and share (mean) of each class explicit.
retained_mask = df['churn'] == 0
churned_mask = df['churn'] == 1

print("데이터셋 정보:")
print(f"  전체 고객: {len(df):,}명")
print(f"  유지 고객: {retained_mask.sum():,}명 ({retained_mask.mean()*100:.1f}%)")
print(f"  이탈 고객: {churned_mask.sum():,}명 ({churned_mask.mean()*100:.1f}%)")

Step 2: 탐색적 데이터 분석

import matplotlib.pyplot as plt

# Churn rate broken down by four key features, one bar chart per panel.
# NOTE(review): the Korean titles need a matplotlib font with Hangul glyphs;
# if they render as empty boxes, set plt.rcParams['font.family'] accordingly.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Churn rate by contract type
churn_by_contract = df.groupby('contract')['churn'].mean()
axes[0, 0].bar(churn_by_contract.index, churn_by_contract.values, color='coral')
axes[0, 0].set_title('계약 유형별 이탈률')
axes[0, 0].set_ylabel('이탈률')

# Churn rate by tenure bucket.
# Grouping on a categorical key: pass `observed` explicitly so the result
# keeps every bucket and pandas does not warn about the changing default.
df['tenure_group'] = pd.cut(df['tenure'], bins=[0, 12, 24, 48, 72], labels=['0-12월', '12-24월', '24-48월', '48-72월'])
churn_by_tenure = df.groupby('tenure_group', observed=False)['churn'].mean()
axes[0, 1].bar(churn_by_tenure.index, churn_by_tenure.values, color='steelblue')
axes[0, 1].set_title('가입 기간별 이탈률')
axes[0, 1].set_ylabel('이탈률')

# Churn rate by monthly-charge bucket (categorical key, same `observed` note)
df['charges_group'] = pd.cut(df['monthly_charges'], bins=[0, 35, 55, 75, 100], labels=['~35', '35-55', '55-75', '75~'])
churn_by_charges = df.groupby('charges_group', observed=False)['churn'].mean()
axes[1, 0].bar(churn_by_charges.index, churn_by_charges.values, color='green')
axes[1, 0].set_title('월 청구액별 이탈률')
axes[1, 0].set_ylabel('이탈률')

# Churn rate by internet service
churn_by_internet = df.groupby('internet_service')['churn'].mean()
axes[1, 1].bar(churn_by_internet.index, churn_by_internet.values, color='purple')
axes[1, 1].set_title('인터넷 서비스별 이탈률')
axes[1, 1].set_ylabel('이탈률')

plt.tight_layout()
plt.show()

Step 3: 데이터 전처리

# 029 범주형 변수 인코딩
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so `df` keeps its original string categories.
df_encoded = df.copy()

# String-valued columns that are converted to integer codes.
categorical_cols = [
    'contract', 'internet_service', 'online_security', 'tech_support',
    'paperless_billing', 'payment_method', 'partner', 'dependents',
]

# Fit one LabelEncoder per column (stored by column name), then apply it.
label_encoders = {name: LabelEncoder().fit(df_encoded[name]) for name in categorical_cols}
for name, encoder in label_encoders.items():
    df_encoded[name] = encoder.transform(df_encoded[name])

# Model inputs: every generated attribute except the identifier and the
# helper bucket columns created for plotting.
feature_cols = [
    'tenure', 'monthly_charges', 'total_charges', 'contract',
    'internet_service', 'online_security', 'tech_support',
    'paperless_billing', 'payment_method', 'senior_citizen',
    'partner', 'dependents',
]

X = df_encoded[feature_cols]
y = df_encoded['churn']

# 80/20 split, stratified on the label so both parts keep the churn ratio.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"학습 데이터: {len(X_train)}명")
print(f"테스트 데이터: {len(X_test)}명")

Step 4: FLAML AutoML 학습

from flaml import AutoML
from sklearn.utils.class_weight import compute_sample_weight

# Per-sample weights in 'balanced' mode, computed from the training labels.
sample_weights = compute_sample_weight('balanced', y_train)

# All search settings in one place; passed to AutoML.fit as keyword arguments.
fit_settings = {
    "task": "classification",
    "time_budget": 120,
    "metric": "f1",
    "sample_weight": sample_weights,
    "seed": 42,
    "verbose": 1,
}

automl = AutoML()
automl.fit(X_train, y_train, **fit_settings)

# The F1 score is reported as 1 - best_loss.
best_f1 = 1 - automl.best_loss
print(f"\n최적 모델: {automl.best_estimator}")
print(f"검증 F1: {best_f1:.4f}")

Step 5: 모델 평가

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import seaborn as sns

# Hard predictions plus the probability assigned to class 1 (churn).
y_pred = automl.predict(X_test)
y_prob = automl.predict_proba(X_test)[:, 1]

# Display names for labels 0 and 1, shared by the report and the heatmap.
class_names = ['유지', '이탈']

# Per-class precision / recall / F1
print("분류 리포트:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Threshold-independent ranking quality
auc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC: {auc:.4f}")

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel('예측')
heatmap_ax.set_ylabel('실제')
heatmap_ax.set_title('고객 이탈 예측 - 혼동 행렬')
plt.tight_layout()
plt.show()

Step 6: 특성 중요도 분석

# FLAML exposes the best trained learner as `automl.model`; the fitted
# underlying estimator is its `.estimator` attribute. Both lookups are
# guarded so learners without importances are skipped instead of raising.
best_estimator = getattr(automl.model, 'estimator', None)
importance = getattr(best_estimator, 'feature_importances_', None)

if importance is not None:
    importance = np.asarray(importance)

    # Feature indices from most to least important
    sorted_idx = np.argsort(importance)[::-1]

    plt.figure(figsize=(10, 6))
    plt.bar(range(len(importance)), importance[sorted_idx])
    plt.xticks(range(len(importance)),
               np.array(feature_cols)[sorted_idx], rotation=45, ha='right')
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.title('고객 이탈 예측 - 특성 중요도')
    plt.tight_layout()
    plt.show()

    # Text summary of the five strongest features
    print("\n특성 중요도 순위:")
    for i, idx in enumerate(sorted_idx[:5], 1):
        print(f"  {i}. {feature_cols[idx]}: {importance[idx]:.4f}")

Step 7: 이탈 확률 기반 세그먼트

# Test features plus the true label, predicted churn probability and
# predicted label, assembled in one step (X_test itself is left untouched).
df_test = X_test.assign(
    actual_churn=y_test.values,
    churn_prob=y_prob,
    predicted_churn=y_pred,
)

# 029 위험도 세그먼트
def risk_segment(prob):
    """Map a churn probability to a risk tier label.

    Args:
        prob: Predicted churn probability.

    Returns:
        '고위험' for prob >= 0.7, '중위험' for prob >= 0.4, otherwise '저위험'.
    """
    # Tiers are checked from the highest floor down; the first match wins.
    for floor, label in ((0.7, '고위험'), (0.4, '중위험')):
        if prob >= floor:
            return label
    return '저위험'

# Tag every test customer with a risk tier derived from the churn probability.
df_test['risk_segment'] = df_test['churn_prob'].map(risk_segment)

# Per-tier customer count, churner count, actual churn rate and mean probability
print("세그먼트별 분석:")
print("-" * 50)
by_segment = df_test.groupby('risk_segment')
segment_analysis = by_segment.agg({
    'actual_churn': ['count', 'sum', 'mean'],
    'churn_prob': 'mean'
}).round(4)
print(segment_analysis)

# Left panel: customers per tier; right panel: actual churn rate per tier.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
count_ax, rate_ax = axes

segment_counts = df_test['risk_segment'].value_counts()
colors = {'고위험': 'red', '중위험': 'orange', '저위험': 'green'}
count_ax.bar(segment_counts.index, segment_counts.values,
             color=segment_counts.index.map(colors).tolist())
count_ax.set_title('위험 세그먼트별 고객 수')
count_ax.set_ylabel('고객 수')

segment_churn = by_segment['actual_churn'].mean()
rate_ax.bar(segment_churn.index, segment_churn.values,
            color=segment_churn.index.map(colors).tolist())
rate_ax.set_title('위험 세그먼트별 실제 이탈률')
rate_ax.set_ylabel('이탈률')

plt.tight_layout()
plt.show()

Step 8: 비즈니스 가치 분석

# 029 비용-편익 분석
def business_analysis(y_true, y_prob, threshold=0.5,
                      cost_retain=50,      # cost of one retention campaign
                      value_retained=200,  # value of a customer who is kept
                      value_lost=500,      # loss from a churner who was missed
                      retention_rate=0.5): # share of targeted churners who stay
    """Estimate the net business value of targeting customers at a threshold.

    Every customer whose churn probability is at or above ``threshold``
    receives a retention campaign. Outcomes are valued as follows:

    * TP (churner, targeted): ``retention_rate`` of them are kept, each worth
      ``value_retained``; every one costs ``cost_retain``.
    * FP (non-churner, targeted): unnecessary campaign, costs ``cost_retain``.
    * FN (churner, not targeted): customer is lost, costs ``value_lost``.
    * TN (non-churner, not targeted): no cost.

    Args:
        y_true: Array-like of actual labels; assumed binary with 1 = churn.
        y_prob: Array-like of predicted churn probabilities, aligned with y_true.
        threshold: Probability at or above which a customer is targeted.
        cost_retain: Cost of one retention campaign.
        value_retained: Value gained per successfully retained customer.
        value_lost: Loss per churner that was not targeted.
        retention_rate: Assumed fraction of targeted churners who are retained.

    Returns:
        dict with ``threshold``, the confusion counts ``tp``/``fp``/``fn``/``tn``,
        and ``revenue_tp``, ``cost_fp``, ``loss_fn``, ``total_value``.
    """
    # np.asarray lets lists and pandas Series work the same as ndarrays.
    actual = np.asarray(y_true).astype(bool)
    targeted = np.asarray(y_prob) >= threshold

    # Counting with masks always yields all four cells, even when only one
    # class appears in the data or in the predictions at this threshold.
    tp = int(np.sum(actual & targeted))
    fp = int(np.sum(~actual & targeted))
    fn = int(np.sum(actual & ~targeted))
    tn = int(np.sum(~actual & ~targeted))

    revenue_tp = tp * retention_rate * value_retained - tp * cost_retain
    cost_fp = fp * cost_retain  # campaigns spent on customers who would have stayed
    loss_fn = fn * value_lost   # churners the model failed to flag

    total_value = revenue_tp - cost_fp - loss_fn

    return {
        'threshold': threshold,
        'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn,
        'revenue_tp': revenue_tp,
        'cost_fp': cost_fp,
        'loss_fn': loss_fn,
        'total_value': total_value
    }

# Evaluate the business value at several decision thresholds.
print("임계값별 비즈니스 가치:")
print("-" * 80)
print(f"{'임계값':<10} {'TP':<6} {'FP':<6} {'FN':<6} {'수익':<12} {'비용':<12} {'손실':<12} {'순가치':<12}")
print("-" * 80)

candidate_thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
sweep_results = []

for threshold in candidate_thresholds:
    result = business_analysis(y_test, y_prob, threshold)
    sweep_results.append(result)
    print(f"{threshold:<10} {result['tp']:<6} {result['fp']:<6} {result['fn']:<6} "
          f"${result['revenue_tp']:<11,.0f} ${result['cost_fp']:<11,.0f} "
          f"${result['loss_fn']:<11,.0f} ${result['total_value']:<11,.0f}")

# max() returns the first maximal entry, so ties go to the lowest threshold.
best_result = max(sweep_results, key=lambda entry: entry['total_value'])
best_threshold = best_result['threshold']
best_value = best_result['total_value']

print(f"\n최적 임계값: {best_threshold} (순가치: ${best_value:,.0f})")

Step 9: 고위험 고객 리스트

# Customers in the high-risk tier, most likely churners first.
high_risk = df_test[df_test['risk_segment'] == '고위험'].sort_values('churn_prob', ascending=False)

# Build a readable view of the top 10: the features in df_test are
# label-encoded, so decode `contract` back to its original text, and attach
# the customer id from the raw frame (df_test keeps the original row index).
high_risk_view = high_risk[['tenure', 'monthly_charges', 'contract', 'churn_prob']].head(10).copy()
high_risk_view['contract'] = label_encoders['contract'].inverse_transform(high_risk_view['contract'])
high_risk_view.insert(0, 'customer_id', df.loc[high_risk_view.index, 'customer_id'])

print("고위험 고객 Top 10:")
print("-" * 60)
print(high_risk_view)

정리

고객 이탈 예측은 비즈니스에 큰 가치를 제공합니다.
F1 Score와 AUC로 모델을 평가합니다.
특성 중요도로 이탈에 영향을 주는 요인을 파악합니다.
위험 세그먼트로 고객을 분류하여 차별화된 대응이 가능합니다.
비용-편익 분석으로 최적 임계값을 결정합니다.
이 실습의 가상 데이터에서는 단기(월 단위) 계약, 짧은 가입 기간, 높은 월 요금이 이탈 확률을 높이도록 설계되어 있으며, 모델도 이 패턴을 학습합니다.

다음 글 예고

다음 글에서는 분류 프로젝트 - 스팸 메일 분류에 대해 알아보겠습니다. 텍스트 데이터를 활용한 스팸 분류 문제를 FLAML로 해결합니다.

FLAML AutoML 마스터 시리즈 #029

개요​

실습 환경​

프로젝트 개요​

비즈니스 배경​

목표​

Step 1: 데이터 준비​

Step 2: 탐색적 데이터 분석​

Step 3: 데이터 전처리​

Step 4: FLAML AutoML 학습​

Step 5: 모델 평가​

Step 6: 특성 중요도 분석​

Step 7: 이탈 확률 기반 세그먼트​

Step 8: 비즈니스 가치 분석​

Step 9: 고위험 고객 리스트​

정리​

다음 글 예고​

개요