038 분류 실전 - 고객 이탈 예측

키워드: 고객 이탈, churn

개요

고객 이탈(Churn) 예측은 가장 흔한 비즈니스 분류 문제 중 하나입니다. 어떤 고객이 서비스를 떠날지 미리 예측하면 기업이 선제적으로 대응할 수 있습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

프로젝트 개요

목표: 통신사 고객 이탈 예측

비즈니스 가치:

이탈 고객 사전 식별
리텐션 캠페인 타겟팅
고객 생애 가치(LTV) 향상

데이터 로드 및 탐색

from pycaret.classification import *
from pycaret.datasets import get_data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Telco customer churn dataset.
# The CSV is read from a remote URL with pandas (not from PyCaret's bundled datasets).
TELCO_CHURN_URL = 'https://raw.githubusercontent.com/IBM/watson-studio-sample-datasets/master/telco-customer-churn/Telco-Customer-Churn.csv'
data = pd.read_csv(TELCO_CHURN_URL)

# Show the (rows, columns) shape followed by the list of column names.
dataset_shape = data.shape
print(f"데이터 크기: {dataset_shape}")
print("\n컬럼 목록:")
print(list(data.columns))

데이터 전처리

# Remove the customer identifier column; it is not used as a feature.
data = data.drop(columns=['customerID'])

# Parse TotalCharges as numeric; entries that cannot be parsed become NaN.
total_charges = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = total_charges

# Report the number of missing values per column.
missing_per_column = data.isnull().sum()
print("결측치:")
print(missing_per_column)

# Print the target class balance as proportions.
churn_ratio = data['Churn'].value_counts(normalize=True)
print("\n이탈 분포:")
print(churn_ratio)

# Bar chart of the raw class counts, written to churn_distribution.png.
churn_counts = data['Churn'].value_counts()
plt.figure(figsize=(6, 4))
churn_counts.plot(kind='bar')
plt.title('Churn Distribution')
plt.xlabel('Churn')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('churn_distribution.png', dpi=150)

탐색적 데이터 분석

# Numeric features vs. churn: one box plot per feature, grouped by the Churn label.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, col in zip(axes, numerical_cols):
    data.boxplot(column=col, by='Churn', ax=ax)
    ax.set_title(col)

# Clear the figure-level title that the grouped boxplot adds.
plt.suptitle('')
plt.tight_layout()
plt.savefig('churn_numerical.png', dpi=150)

# Categorical features vs. churn: row-normalised crosstab, so each category's bars sum to 1.
categorical_cols = ['Contract', 'PaymentMethod', 'InternetService']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, col in zip(axes, categorical_cols):
    churn_share = pd.crosstab(data[col], data['Churn'], normalize='index')
    churn_share.plot(kind='bar', ax=ax)
    ax.set_title(f'{col} vs Churn')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.savefig('churn_categorical.png', dpi=150)

PyCaret 설정

# Columns that PyCaret should treat as categorical.
# SeniorCitizen is listed explicitly because it is stored as 0/1 in the data.
categorical_feature_names = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Initialise the PyCaret classification experiment.
clf = setup(
    data=data,
    target='Churn',
    categorical_features=categorical_feature_names,
    numeric_imputation='median',  # fill missing numeric values with the median
    fix_imbalance=True,           # resample to counter the class imbalance
    session_id=42,                # fixed seed for reproducibility
    verbose=False,
)

모델 비교

# Compare candidate models and keep the five best, ranked by AUC.
# (The tutorial stresses recall because missing a churner is costly;
# the ranking metric actually used here is AUC.)
print("=== 모델 비교 ===")
best_models = compare_models(sort='AUC', n_select=5)

# List the class names of the selected models in rank order.
for rank, candidate in enumerate(best_models, start=1):
    print(f"{rank}. {type(candidate).__name__}")

최고 모델 튜닝

# Tune the hyperparameters of the top-ranked model, optimising AUC.
print("\n=== 모델 튜닝 ===")
top_ranked_model = best_models[0]
tuned_model = tune_model(estimator=top_ranked_model, optimize='AUC')

앙상블

# Blend the three top-ranked models into a single ensemble.
print("\n=== 앙상블 ===")
top_three_models = best_models[:3]
blended = blend_models(estimator_list=top_three_models)

모델 평가

# Confusion matrix of the blended ensemble.
plot_model(blended, plot='confusion_matrix', save=True)

# ROC curve of the blended ensemble.
plot_model(blended, plot='auc', save=True)

# Feature importance.
# The blended (voting) ensemble exposes neither `coef_` nor
# `feature_importances_`, so PyCaret cannot draw this plot for it.
# Use the highest-ranked single model that exposes one of those attributes.
feature_model = next(
    (
        candidate for candidate in best_models
        if hasattr(candidate, 'coef_') or hasattr(candidate, 'feature_importances_')
    ),
    None,
)
if feature_model is not None:
    plot_model(feature_model, plot='feature', save=True)
else:
    print("특성 중요도 플롯 생략: 상위 모델 중 coef_ 또는 feature_importances_를 지원하는 모델이 없습니다.")

# Classification report of the blended ensemble.
plot_model(blended, plot='class_report', save=True)

모델 해석 (SHAP)

# Interpret a single model (an ensemble is hard to interpret).
# PyCaret's SHAP-based interpret_model only supports tree-based estimators,
# so pick the highest-ranked model of a supported type rather than blindly
# using best_models[0], which may be e.g. a linear model.
SHAP_SUPPORTED_MODELS = {
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'ExtraTreesClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'CatBoostClassifier',
}

best_single = next(
    (
        candidate for candidate in best_models
        if type(candidate).__name__ in SHAP_SUPPORTED_MODELS
    ),
    None,
)

# None of the top models is tree-based: train a LightGBM model just for interpretation.
if best_single is None:
    best_single = create_model('lightgbm', verbose=False)

# SHAP summary plot.
interpret_model(best_single)

예측 결과 분석

# Score the hold-out (test) split with the blended model.
predictions = predict_model(blended)

# `prediction_score` is the probability of the *predicted* label, not of churn.
# Convert it to the probability of the positive class ('Yes'): keep the score
# where the model predicted 'Yes', otherwise take its complement.
churn_probability = predictions['prediction_score'].where(
    predictions['prediction_label'] == 'Yes',
    1 - predictions['prediction_score'],
)

# Distribution of the predicted churn probability, split by the actual label.
plt.figure(figsize=(10, 6))
for actual_label, legend_label in (('No', 'No Churn'), ('Yes', 'Churn')):
    churn_probability[predictions['Churn'] == actual_label].hist(
        bins=50, range=(0, 1), alpha=0.7, label=legend_label
    )
plt.xlabel('Predicted Churn Probability')
plt.ylabel('Count')
plt.title('Predicted Probability Distribution by Actual Churn')
plt.legend()
plt.tight_layout()
plt.savefig('prediction_distribution.png', dpi=150)

비즈니스 임계값 설정

# Choose a decision threshold that fits the business goal instead of the default 0.5.
import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = (predictions['Churn'] == 'Yes').astype(int)

# `prediction_score` is the probability of the *predicted* label; convert it to
# the probability of the positive class ('Yes') before building the PR curve.
y_score = predictions['prediction_score'].where(
    predictions['prediction_label'] == 'Yes',
    1 - predictions['prediction_score'],
)

precision, recall, thresholds = precision_recall_curve(y_true, y_score)

# Find the highest threshold whose recall is still at least the target.
# `precision`/`recall` have one more entry than `thresholds` (the final point has
# no threshold), so the last element is dropped to keep indices aligned.
# Recall is non-increasing as the threshold grows, so the last qualifying index
# is the strictest threshold that still meets the target.
target_recall = 0.8
meets_target = np.flatnonzero(recall[:-1] >= target_recall)
if meets_target.size == 0:
    raise ValueError("No threshold reaches the target recall; check that the test split contains churners.")
idx = meets_target[-1]
optimal_threshold = thresholds[idx]

print(f"Recall {target_recall*100}% 달성 임계값: {optimal_threshold:.4f}")
print(f"해당 임계값의 Precision: {precision[idx]:.4f}")

# Re-label customers using the new threshold on the churn probability (1 = churn).
predictions['custom_prediction'] = (y_score >= optimal_threshold).astype(int)

고위험 고객 세그먼트

# High-risk customers: churn probability above 0.7.
# `prediction_score` is the probability of the *predicted* label, so filtering on
# it directly would also select customers confidently predicted NOT to churn.
# Convert it to the probability of the positive class ('Yes') first.
churn_probability = predictions['prediction_score'].where(
    predictions['prediction_label'] == 'Yes',
    1 - predictions['prediction_score'],
)
high_risk = predictions[churn_probability > 0.7]
print(f"고위험 고객 수: {len(high_risk)}")

# Summary statistics of the high-risk segment.
print("\n고위험 고객 특성:")
print(high_risk.describe())

최종 모델 저장

# Finalize the blended ensemble before saving.
final_model = finalize_model(estimator=blended)

# Save the finalized model under the name 'churn_model'.
save_model(model=final_model, model_name='churn_model')

print("모델 저장 완료: churn_model.pkl")

새 고객 예측

# Load the saved model from disk.
loaded_model = load_model('churn_model')

# A single new customer, using the same feature columns as the training data
# (without customerID and the Churn target).
new_customer = pd.DataFrame({
    'gender': ['Male'],
    'SeniorCitizen': [0],
    'Partner': ['Yes'],
    'Dependents': ['No'],
    'tenure': [12],
    'PhoneService': ['Yes'],
    'MultipleLines': ['No'],
    'InternetService': ['Fiber optic'],
    'OnlineSecurity': ['No'],
    'OnlineBackup': ['No'],
    'DeviceProtection': ['No'],
    'TechSupport': ['No'],
    'StreamingTV': ['Yes'],
    'StreamingMovies': ['Yes'],
    'Contract': ['Month-to-month'],
    'PaperlessBilling': ['Yes'],
    'PaymentMethod': ['Electronic check'],
    'MonthlyCharges': [89.5],
    'TotalCharges': [1074.0]
})

# Predict for the new customer.
prediction = predict_model(loaded_model, data=new_customer)
predicted_label = prediction['prediction_label'].values[0]
predicted_score = prediction['prediction_score'].values[0]

# `prediction_score` is the probability of the *predicted* label. When the model
# predicts 'No', the churn probability is therefore the complement of the score.
churn_probability = predicted_score if predicted_label == 'Yes' else 1 - predicted_score

print(f"\n이탈 예측: {'Yes' if predicted_label == 'Yes' else 'No'}")
print(f"이탈 확률: {churn_probability:.2%}")

정리

고객 이탈은 대표적인 불균형 분류 문제
Recall 중시 (이탈 고객 놓치면 안 됨)
비즈니스 임계값 조정으로 Trade-off 관리
특성 중요도와 SHAP으로 이탈 원인 파악
고위험 고객 세그먼트로 타겟 마케팅

다음 글 예고

다음 글에서는 분류 실전 - 스팸 메일 분류를 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #038

개요​

실습 환경​

프로젝트 개요​

데이터 로드 및 탐색​

데이터 전처리​

탐색적 데이터 분석​

PyCaret 설정​

모델 비교​

최고 모델 튜닝​

앙상블​

모델 평가​

모델 해석 (SHAP)​

예측 결과 분석​

비즈니스 임계값 설정​

고위험 고객 세그먼트​

최종 모델 저장​

새 고객 예측​

정리​

다음 글 예고​

개요

실습 환경

프로젝트 개요

데이터 로드 및 탐색

데이터 전처리

탐색적 데이터 분석

PyCaret 설정

모델 비교

최고 모델 튜닝

앙상블

모델 평가

모델 해석 (SHAP)

예측 결과 분석

비즈니스 임계값 설정

고위험 고객 세그먼트

최종 모델 저장

새 고객 예측

정리

다음 글 예고