082 종합 프로젝트 - 캐글 대회 도전 (2) 모델링

키워드: 캐글, Kaggle, 모델링, FLAML AutoML

개요

이전 글에서 탐색한 고객 이탈 데이터를 바탕으로, FLAML AutoML을 활용하여 예측 모델을 구축합니다. 특성 엔지니어링부터 모델 학습, 평가까지 전체 모델링 파이프라인을 진행합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, numpy, scikit-learn, matplotlib

pip install "flaml[automl]" pandas numpy scikit-learn matplotlib

데이터 로드 및 준비

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from flaml import AutoML

# Rebuild the synthetic churn dataset used in the previous article.
# Every draw below consumes NumPy's global RNG, so the order of the draws
# must stay fixed: reordering them yields a different dataset for seed 42.
np.random.seed(42)
n_samples = 10000

data = {}
data['customer_id'] = range(1, n_samples + 1)
data['age'] = np.random.randint(18, 70, n_samples)
data['gender'] = np.random.choice(['M', 'F'], n_samples)
data['tenure'] = np.random.randint(1, 72, n_samples)
data['monthly_charges'] = np.random.uniform(20, 100, n_samples)
data['total_charges'] = np.random.uniform(100, 5000, n_samples)
data['num_products'] = np.random.randint(1, 5, n_samples)
data['has_phone_service'] = np.random.choice([0, 1], n_samples, p=[0.1, 0.9])
data['has_internet_service'] = np.random.choice([0, 1], n_samples, p=[0.2, 0.8])
data['has_streaming'] = np.random.choice([0, 1], n_samples, p=[0.4, 0.6])
data['num_support_tickets'] = np.random.poisson(2, n_samples)
data['avg_monthly_usage'] = np.random.uniform(10, 500, n_samples)
data['contract_type'] = np.random.choice(
    ['month', 'year', '2year'], n_samples, p=[0.5, 0.3, 0.2]
)
data['payment_method'] = np.random.choice(['credit', 'bank', 'electronic'], n_samples)

df = pd.DataFrame(data)

# Build the churn target: a base rate of 0.1 adjusted by four 0/1 indicators.
is_short_tenure = (df['tenure'] < 12).astype(int)
has_many_tickets = (df['num_support_tickets'] > 3).astype(int)
is_monthly_contract = (df['contract_type'] == 'month').astype(int)
is_low_charge = (df['monthly_charges'] < 50).astype(int)

churn_prob = (
    0.1
    + 0.3 * is_short_tenure
    + 0.2 * has_many_tickets
    + 0.1 * is_monthly_contract
    - 0.1 * is_low_charge
)
# Keep the values inside [0, 1] before using them as Bernoulli probabilities.
churn_prob = np.clip(churn_prob, 0, 1)
df['churn'] = np.random.binomial(1, churn_prob)

churn_rate_pct = df['churn'].mean() * 100
print(f"데이터 로드 완료: {df.shape}")
print(f"이탈률: {churn_rate_pct:.2f}%")

특성 엔지니어링

# Names of every column this section adds, in creation order.
new_cols = ['avg_charge_per_month', 'service_count', 'ticket_per_tenure',
            'charge_per_product', 'is_new_customer', 'is_high_spender',
            'age_group', 'tenure_group']

# 1. Derived numeric / flag features.
tenure_plus_one = df['tenure'] + 1  # shared denominator for the per-tenure ratios
median_monthly_charge = df['monthly_charges'].median()

df['avg_charge_per_month'] = df['total_charges'] / tenure_plus_one
df['service_count'] = (
    df['has_phone_service'] + df['has_internet_service'] + df['has_streaming']
)
df['ticket_per_tenure'] = df['num_support_tickets'] / tenure_plus_one
df['charge_per_product'] = df['monthly_charges'] / (df['num_products'] + 0.1)
df['is_new_customer'] = (df['tenure'] < 12).astype(int)
df['is_high_spender'] = (df['monthly_charges'] > median_monthly_charge).astype(int)

# 2. Bucket age into labelled groups (pd.cut intervals are right-inclusive).
age_bins = [0, 25, 35, 50, 100]
age_labels = ['young', 'adult', 'middle', 'senior']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# 3. Bucket tenure into labelled groups.
tenure_bins = [0, 12, 24, 48, 100]
tenure_labels = ['0-1y', '1-2y', '2-4y', '4y+']
df['tenure_group'] = pd.cut(df['tenure'], bins=tenure_bins, labels=tenure_labels)

print("생성된 파생 변수:")
print("\n".join(f"  - {col}" for col in new_cols))

print(f"\n특성 엔지니어링 후 데이터: {df.shape}")

인코딩 및 전처리

# Columns holding categorical values that need an integer encoding.
categorical_cols = ['gender', 'contract_type', 'payment_method', 'age_group', 'tenure_group']

# Label-encode a copy so the original frame keeps its readable categories.
# Each fitted encoder is stored so the mapping can be reused later.
df_encoded = df.copy()
label_encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    as_text = df_encoded[col].astype(str)  # pd.cut columns are categorical; encode their string form
    df_encoded[col] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder
    class_to_code = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{col}: {class_to_code}")

# Split features from the target; the identifier and the target are not features.
non_feature_cols = ['customer_id', 'churn']
feature_cols = [name for name in df_encoded.columns if name not in non_feature_cols]
X = df_encoded[feature_cols]
y = df_encoded['churn']

print(f"\n특성 수: {len(feature_cols)}")
print(f"특성 목록: {feature_cols}")

데이터 분할

# Two-stage stratified split: hold out 20% for test first, then take 25% of
# the remaining 80% for validation, giving 60% / 20% / 20% overall.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

print("데이터 분할 완료:")
for split_label, features, target in (
    ("학습", X_train, y_train),
    ("검증", X_val, y_val),
    ("테스트", X_test, y_test),
):
    print(f"  {split_label}: {features.shape} ({target.mean()*100:.1f}% 이탈)")

FLAML AutoML 학습

# Configure and run the FLAML AutoML search.
# Only the training split is passed to fit(); no X_val/y_val is supplied here.
automl = AutoML()

automl.fit(
    X_train, y_train,
    task="classification",
    metric="roc_auc",           # optimise ROC AUC
    time_budget=120,            # search for 2 minutes
    estimator_list=['lgbm', 'xgboost', 'rf', 'extra_tree'],
    n_jobs=-1,
    seed=42,
    verbose=2
)

print("\n=== FLAML AutoML 결과 ===")
print(f"최적 모델: {automl.best_estimator}")
print(f"최적 설정: {automl.best_config}")
# config_history maps iteration -> (estimator, config, time) and gains an
# entry only when the best model is updated, so its length is the number of
# best-model updates, not the total number of trials.
print(f"최적 모델 갱신 횟수: {len(automl.config_history)}")

모델 평가

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report,
                             confusion_matrix, roc_curve)
import matplotlib.pyplot as plt

# Hard class predictions for each split.
y_train_pred, y_val_pred, y_test_pred = (
    automl.predict(features) for features in (X_train, X_val, X_test)
)

# Positive-class (column 1) probabilities for each split.
y_train_proba, y_val_proba, y_test_proba = (
    automl.predict_proba(features)[:, 1] for features in (X_train, X_val, X_test)
)

# 082 평가 함수
def evaluate_model(y_true, y_pred, y_proba, name=""):
    """Compute, print and return the classification metrics for one dataset.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for accuracy/precision/recall/F1.
        y_proba: Predicted scores for the positive class, used for ROC AUC.
        name: Label shown in the printed section header.

    Returns:
        dict mapping 'Accuracy', 'Precision', 'Recall', 'F1 Score' and
        'ROC AUC' (in that order) to their values.
    """
    label_based_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    metrics = {label: scorer(y_true, y_pred) for label, scorer in label_based_scorers}
    # ROC AUC is the only metric computed from scores rather than hard labels.
    metrics['ROC AUC'] = roc_auc_score(y_true, y_proba)

    print(f"\n=== {name} 성능 ===")
    for label, score in metrics.items():
        print(f"  {label}: {score:.4f}")
    return metrics

# Evaluate train, validation and test in that order; each call prints its own report.
train_metrics, val_metrics, test_metrics = (
    evaluate_model(actual, predicted, scores, split_label)
    for actual, predicted, scores, split_label in (
        (y_train, y_train_pred, y_train_proba, "학습"),
        (y_val, y_val_pred, y_val_proba, "검증"),
        (y_test, y_test_pred, y_test_proba, "테스트"),
    )
)

혼동 행렬 시각화

# Draw one confusion-matrix heatmap per split, side by side.
# NOTE(review): the tick labels and titles are Korean; depending on the local
# matplotlib font setup they may need a Korean-capable font — confirm.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

class_names = ['유지', '이탈']
cell_positions = [(row, col) for row in range(2) for col in range(2)]
panels = [
    (y_train, y_train_pred, '학습'),
    (y_val, y_val_pred, '검증'),
    (y_test, y_test_pred, '테스트'),
]

for ax, (y_true, y_pred, title) in zip(axes, panels):
    cm = confusion_matrix(y_true, y_pred)
    ax.imshow(cm, cmap='Blues')

    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(class_names)
    ax.set_yticklabels(class_names)
    ax.set(xlabel='예측', ylabel='실제', title=f'{title} 혼동 행렬')

    # Rows are actual classes (y axis), columns are predicted classes (x axis).
    for i, j in cell_positions:
        ax.text(j, i, cm[i, j], ha='center', va='center', fontsize=14)

plt.tight_layout()
plt.show()

ROC 곡선

# Overlay the ROC curves of all three splits on a single figure.
plt.figure(figsize=(10, 8))

roc_inputs = {
    '학습': (y_train, y_train_proba, train_metrics),
    '검증': (y_val, y_val_proba, val_metrics),
    '테스트': (y_test, y_test_proba, test_metrics),
}

for split_label, (actual, scores, split_metrics) in roc_inputs.items():
    fpr, tpr, _ = roc_curve(actual, scores)
    legend_text = f'{split_label} (AUC={split_metrics["ROC AUC"]:.4f})'
    plt.plot(fpr, tpr, label=legend_text)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC 곡선 비교')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

특성 중요도

# Report feature importances when the selected estimator exposes them.
fitted_estimator = automl.model.estimator
if hasattr(fitted_estimator, 'feature_importances_'):
    importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': fitted_estimator.feature_importances_,
    })
    importance = importance.sort_values('importance', ascending=False)

    print("\n=== 특성 중요도 (Top 10) ===")
    print(importance.head(10).to_string(index=False))

    # Horizontal bar chart of the 15 most important features, largest on top.
    top_features = importance.head(15)
    plt.figure(figsize=(10, 8))
    plt.barh(top_features['feature'], top_features['importance'])
    plt.xlabel('중요도')
    plt.title(f'{automl.best_estimator} 특성 중요도')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

임계값 최적화

from sklearn.metrics import precision_recall_curve

# Precision-recall curve on the validation split.
precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)

# precision_recall_curve returns one more precision/recall point than
# thresholds: the final (precision=1, recall=0) point has no threshold.
# Dropping it keeps the F1 array index-aligned with `thresholds`, so the
# argmax is always a valid threshold index and no fallback value is needed.
# (That final point has F1 == 0, so it could never win the argmax anyway.)
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-10)
best_idx = int(np.argmax(f1_scores))
best_threshold = thresholds[best_idx]

print("\n=== 임계값 최적화 ===")
print(f"기본 임계값 (0.5): F1 = {val_metrics['F1 Score']:.4f}")
print(f"최적 임계값 ({best_threshold:.3f}): F1 = {f1_scores[best_idx]:.4f}")

# Re-label using the tuned threshold. `>=` matches how precision_recall_curve
# defines a positive prediction at a given threshold.
y_val_pred_opt = (y_val_proba >= best_threshold).astype(int)
y_test_pred_opt = (y_test_proba >= best_threshold).astype(int)

print("\n최적 임계값 적용 후 테스트 성능:")
print(f"  Precision: {precision_score(y_test, y_test_pred_opt):.4f}")
print(f"  Recall: {recall_score(y_test, y_test_pred_opt):.4f}")
print(f"  F1 Score: {f1_score(y_test, y_test_pred_opt):.4f}")

상세 분류 리포트

# Per-class report on the test split, using the default-threshold predictions.
print("\n=== 테스트 데이터 상세 분류 리포트 ===")
class_names = ['유지', '이탈']
print(classification_report(y_test, y_test_pred, target_names=class_names))

# Break the test confusion matrix into its four cells.
print("\n=== 비즈니스 관점 분석 ===")
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
for description, count in (
    ("True Negative (정확히 유지 예측)", tn),
    ("False Positive (유지를 이탈로 예측)", fp),
    ("False Negative (이탈을 유지로 예측)", fn),
    ("True Positive (정확히 이탈 예측)", tp),
):
    print(f"{description}: {count}")

# Illustrative cost model: each false positive costs one retention offer,
# each false negative costs one customer re-acquisition.
print("\n가정: 이탈 방지 비용 $50, 고객 획득 비용 $200")
retention_cost_per_customer = 50
acquisition_cost_per_customer = 200
cost_fp = fp * retention_cost_per_customer
cost_fn = fn * acquisition_cost_per_customer
total_cost = cost_fp + cost_fn
for cost_label, amount in (
    ("False Positive 비용", cost_fp),
    ("False Negative 비용", cost_fn),
    ("총 비용", total_cost),
):
    print(f"{cost_label}: ${amount:,}")

모델 저장

import joblib

# Persist the fitted AutoML object.
model_path = 'churn_automl_model.pkl'
joblib.dump(automl, model_path)
print(f"\n모델 저장: {model_path}")

# Persist the feature column list in the order used for training.
import json
with open('feature_columns.json', 'w') as features_file:
    features_file.write(json.dumps(feature_cols))
print("특성 목록 저장: feature_columns.json")

# Persist the chosen estimator, its configuration, the tuned threshold and
# the test metrics. Threshold and metric values are cast to plain floats
# before being serialised to JSON.
config = {
    'best_estimator': automl.best_estimator,
    'best_config': automl.best_config,
    'best_threshold': float(best_threshold),
    'test_metrics': dict(
        (metric_name, float(score)) for metric_name, score in test_metrics.items()
    ),
}
with open('model_config.json', 'w') as config_file:
    config_file.write(json.dumps(config, indent=2))
print("설정 저장: model_config.json")

모델링 결과 요약

# One-table recap of the selected model and its test-set metrics.
# The last row is len(automl.config_history): that history gains an entry only
# when the best model is updated, so it is labelled as the number of
# best-model updates rather than the total number of trials.
summary = {
    '항목': ['최적 모델', 'ROC AUC', 'F1 Score', '정밀도', '재현율', '최적 모델 갱신 횟수'],
    '결과': [
        automl.best_estimator,
        f"{test_metrics['ROC AUC']:.4f}",
        f"{test_metrics['F1 Score']:.4f}",
        f"{test_metrics['Precision']:.4f}",
        f"{test_metrics['Recall']:.4f}",
        len(automl.config_history)
    ]
}

print("\n=== 모델링 결과 요약 ===")
print(pd.DataFrame(summary).to_string(index=False))

정리

특성 엔지니어링: 8개 파생 변수 생성
FLAML AutoML: 2분 학습으로 최적 모델 탐색
평가: ROC AUC, F1, Precision, Recall 분석
임계값 최적화: 검증 데이터에서 F1 점수가 최대가 되는 임계값 선택 (비즈니스 목적에 맞게 조정 가능)
모델 저장: joblib으로 재사용 가능

다음 글 예고

다음 글에서는 캐글 대회 도전 (3) 튜닝을 진행합니다. 더 정교한 하이퍼파라미터 튜닝과 앙상블 기법을 적용합니다.

FLAML AutoML 마스터 시리즈 #082

개요​

실습 환경​

데이터 로드 및 준비​

특성 엔지니어링​

인코딩 및 전처리​

데이터 분할​

FLAML AutoML 학습​

모델 평가​

혼동 행렬 시각화​

ROC 곡선​

특성 중요도​

임계값 최적화​

상세 분류 리포트​

모델 저장​

모델링 결과 요약​

정리​

다음 글 예고​

개요

실습 환경

데이터 로드 및 준비

특성 엔지니어링

인코딩 및 전처리

데이터 분할

FLAML AutoML 학습

모델 평가

혼동 행렬 시각화

ROC 곡선

특성 중요도

임계값 최적화

상세 분류 리포트

모델 저장

모델링 결과 요약

정리

다음 글 예고