081 종합 프로젝트 - 캐글 대회 도전 (1) 데이터 탐색

키워드: 캐글, Kaggle, EDA, 데이터 탐색

개요

캐글(Kaggle)은 세계 최대의 데이터 사이언스 경진대회 플랫폼입니다. 이번 시리즈(081-083)에서는 FLAML을 활용하여 실제 캐글 대회 스타일의 문제를 해결해봅니다. 첫 번째 글에서는 데이터 탐색(EDA)을 진행합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl]

pip install "flaml[automl]" pandas numpy matplotlib seaborn scikit-learn

프로젝트 개요: 고객 이탈 예측

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fix the global NumPy seed so every run produces the same synthetic dataset.
# NOTE: the random draws below consume the global RNG stream in order, so
# reordering or inserting draws changes the generated data.
np.random.seed(42)

# Build a Kaggle-style synthetic dataset for customer churn prediction.
n_samples = 10000

data = {
    # Customer profile
    'customer_id': range(1, n_samples + 1),
    'age': np.random.randint(18, 70, n_samples),
    'gender': np.random.choice(['M', 'F'], n_samples),
    'tenure': np.random.randint(1, 72, n_samples),  # months since sign-up

    # Service usage
    'monthly_charges': np.random.uniform(20, 100, n_samples),
    'total_charges': np.random.uniform(100, 5000, n_samples),
    'num_products': np.random.randint(1, 5, n_samples),
    'has_phone_service': np.random.choice([0, 1], n_samples, p=[0.1, 0.9]),
    'has_internet_service': np.random.choice([0, 1], n_samples, p=[0.2, 0.8]),
    'has_streaming': np.random.choice([0, 1], n_samples, p=[0.4, 0.6]),

    # Customer behaviour
    'num_support_tickets': np.random.poisson(2, n_samples),
    'avg_monthly_usage': np.random.uniform(10, 500, n_samples),
    'contract_type': np.random.choice(['month', 'year', '2year'], n_samples, p=[0.5, 0.3, 0.2]),
    'payment_method': np.random.choice(['credit', 'bank', 'electronic'], n_samples),
}

df = pd.DataFrame(data)

# Build the target (churn flag) from a per-customer churn probability:
#   base 0.1
#   +0.3 if tenure is under 12 months
#   +0.2 if more than 3 support tickets were filed
#   +0.1 for month-to-month contracts
#   -0.1 if the monthly charge is below 50
# With these weights the probability already lies in [0.0, 0.7]; the clip is
# a safeguard that keeps it a valid probability if the weights are changed.
churn_prob = 0.1 + 0.3 * (df['tenure'] < 12).astype(int) + \
             0.2 * (df['num_support_tickets'] > 3).astype(int) + \
             0.1 * (df['contract_type'] == 'month').astype(int) - \
             0.1 * (df['monthly_charges'] < 50).astype(int)
churn_prob = np.clip(churn_prob, 0, 1)
# One Bernoulli draw per customer using that customer's own probability.
df['churn'] = np.random.binomial(1, churn_prob)

print("=== 고객 이탈 예측 데이터셋 ===")
print(f"데이터 크기: {df.shape}")
print(f"\n처음 5행:")
print(df.head())

기본 데이터 정보

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\n=== 데이터 정보 ===")
df.info()

# Summary statistics of the numeric columns.
print("\n=== 기초 통계량 ===")
print(df.describe())

# Missing values per column; only columns with at least one NaN are listed,
# otherwise a "no missing values" message is printed.
print("\n=== 결측치 현황 ===")
missing = df.isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "결측치 없음")

# Target distribution: class counts and the overall churn rate in percent.
print("\n=== 타겟(이탈) 분포 ===")
print(df['churn'].value_counts())
print(f"\n이탈률: {df['churn'].mean() * 100:.2f}%")

수치형 변수 탐색

# Numeric feature columns: every numeric dtype except the row identifier and
# the target itself.
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in ('customer_id', 'churn')
]

print(f"수치형 변수: {numeric_cols}")

# Histogram of every numeric feature. The grid is sized from the number of
# columns (3 per row) so that no feature is silently dropped: a fixed 3x3
# grid would omit the 10th numeric column of this dataset.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(numeric_cols) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(n_plot_rows, n_plot_cols,
                         figsize=(15, 4 * n_plot_rows))
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    df[col].hist(bins=30, ax=ax, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col} 분포')
    ax.set_xlabel(col)
    ax.set_ylabel('빈도')

# Hide the panels of the last row that have no feature assigned.
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Per-feature summary statistics, one feature per row.
print("\n수치형 변수 통계:")
print(df[numeric_cols].describe().T)

범주형 변수 탐색

# Categorical features: every object-dtype column of the frame.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
print(f"범주형 변수: {categorical_cols}")

# One bar chart of category frequencies per categorical feature.
# squeeze=False keeps the axes container two-dimensional even when there is
# only a single column, so the first row can always be iterated directly.
fig, axes = plt.subplots(1, len(categorical_cols), figsize=(15, 4),
                         squeeze=False)

for ax, col in zip(axes[0], categorical_cols):
    frequencies = df[col].value_counts()
    frequencies.plot(kind='bar', ax=ax, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col} 분포')
    ax.set_xlabel(col)
    ax.set_ylabel('빈도')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# The same frequencies as text, one table per feature.
print("\n범주형 변수 분포:")
for col in categorical_cols:
    print(f"\n{col}:", df[col].value_counts(), sep="\n")

타겟과 변수 관계 분석

# Mean of selected numeric features split by churn outcome, drawn as a
# 2x3 grid of two-bar charts (blue = retained, red = churned).
compare_cols = [
    'tenure',
    'monthly_charges',
    'total_charges',
    'num_products',
    'num_support_tickets',
    'avg_monthly_usage',
]
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
by_churn = df.groupby('churn')

for ax, col in zip(axes.flatten(), compare_cols):
    group_means = by_churn[col].mean()
    group_means.plot(kind='bar', ax=ax, color=['blue', 'red'])
    ax.set_title(f'이탈 여부별 {col} 평균')
    ax.set_xlabel('이탈 (0=유지, 1=이탈)')
    ax.set_ylabel(f'평균 {col}')
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

# The same comparison as numbers for every numeric feature
# (rows = features, columns = churn classes).
print("\n이탈 여부별 수치형 변수 평균:")
means_by_churn = df.groupby('churn')[numeric_cols].mean()
print(means_by_churn.T)

범주형 변수와 이탈 관계

# Churn rate per category for every categorical feature, with the overall
# churn rate drawn as a dashed reference line.
fig, axes = plt.subplots(1, len(categorical_cols), figsize=(15, 4))

# The overall rate is the same for every panel, so compute it once.
overall_churn_rate = df['churn'].mean()

for i, col in enumerate(categorical_cols):
    # plt.subplots returns a bare Axes (not an array) for a single panel.
    ax = axes[i] if len(categorical_cols) > 1 else axes
    churn_rate = df.groupby(col)['churn'].mean()
    churn_rate.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
    ax.set_title(f'{col}별 이탈률')
    ax.set_xlabel(col)
    ax.set_ylabel('이탈률')
    ax.axhline(y=overall_churn_rate, color='red', linestyle='--', label='전체 평균')
    # Without a legend the label given to the reference line is never shown.
    ax.legend()
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Same information as numbers: group size, churned count, churn rate.
print("\n범주형 변수별 이탈률:")
for col in categorical_cols:
    print(f"\n{col}별 이탈률:")
    print(df.groupby(col)['churn'].agg(['count', 'sum', 'mean']))

상관관계 분석

# Pairwise correlation matrix of the numeric features plus the target.
correlation_cols = [*numeric_cols, 'churn']
correlation = df[correlation_cols].corr()

# Annotated heatmap on a diverging colour map centred at zero, so positive
# and negative correlations are visually symmetric.
plt.figure(figsize=(12, 10))
heatmap_options = dict(annot=True, cmap='RdBu_r', center=0,
                       fmt='.2f', square=True, linewidths=0.5)
sns.heatmap(correlation, **heatmap_options)
plt.title('변수 간 상관관계')
plt.tight_layout()
plt.show()

# Features ranked by the absolute strength of their correlation with churn;
# the target's correlation with itself is removed first.
print("\n이탈(churn)과의 상관관계:")
target_corr = correlation['churn'].drop('churn')
churn_corr = target_corr.sort_values(key=abs, ascending=False)
print(churn_corr)

이상치 탐지

def detect_outliers(df, column, method='iqr'):
    """Count the outliers of one column.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to inspect.
        method: 'iqr' flags values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
            Any other value selects the z-score rule, which flags values
            whose absolute z-score exceeds 3.

    Returns:
        A tuple (count, percent): the number of outlier rows and their share
        of all rows in percent. An empty frame yields (0, 0.0).

    Raises:
        KeyError: If column is not present in df.
    """
    values = df[column]
    total = len(df)
    if total == 0:
        # Nothing to inspect; avoids dividing by zero below.
        return 0, 0.0

    if method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        is_outlier = (values < lower) | (values > upper)
    else:  # z-score
        z_scores = np.abs((values - values.mean()) / values.std())
        # A zero standard deviation gives NaN scores, which compare False
        # and are therefore never flagged.
        is_outlier = z_scores > 3

    # Counting the boolean mask avoids copying the outlier rows.
    count = int(is_outlier.sum())
    return count, count / total * 100

# Report only the features where more than 1% of the rows are IQR outliers.
print("=== 이상치 분석 (IQR 방법) ===")
for col in numeric_cols:
    n_outliers, outlier_pct = detect_outliers(df, col)
    if not outlier_pct > 1:
        continue
    print(f"{col}: {n_outliers}개 ({outlier_pct:.2f}%)")

# Box plots of the first six numeric features, grouped by churn outcome,
# laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols[:6]):
    df.boxplot(column=col, by='churn', ax=ax)
    ax.set_title(f'{col} 이탈별 분포')

plt.tight_layout()
plt.show()

특성 중요도 초기 확인

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical columns on a copy, so the original frame
# keeps its readable category labels.
df_encoded = df.copy()
for col in categorical_cols:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Every column except the identifier and the target is a model feature.
non_feature_cols = ['customer_id', 'churn']
feature_cols = [c for c in df_encoded.columns if c not in non_feature_cols]
X, y = df_encoded[feature_cols], df_encoded['churn']

# Fit a random forest to obtain a first feature-importance ranking.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X, y)

# Importance table, most important feature first.
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_,
})
importance = importance.sort_values('importance', ascending=False)

print("\n=== 특성 중요도 (Random Forest) ===")
print(importance.to_string(index=False))

# Horizontal bar chart; the y axis is inverted so the most important
# feature appears at the top.
plt.figure(figsize=(10, 8))
plt.barh(importance['feature'], importance['importance'], color='steelblue')
plt.xlabel('중요도')
plt.title('특성 중요도 (Random Forest)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

EDA 인사이트 정리

# Summary table of the EDA findings: one row per finding, with what was
# observed and what it implies for modelling. The three lists must stay the
# same length and in the same order.
insights = {
    '발견 사항': [
        '이탈률',
        '가입 기간(tenure)',
        '고객 지원 티켓',
        '계약 유형',
        '월 요금'
    ],
    '내용': [
        f"전체 이탈률 {df['churn'].mean()*100:.1f}%",
        '가입 기간이 짧을수록 이탈률 높음',
        '지원 티켓 많으면 이탈 위험 증가',
        '월간 계약자가 이탈률 가장 높음',
        # The generated churn probability is reduced for monthly charges
        # below 50, so it is the low-charge customers who churn less.
        '저요금(월 50 미만) 고객이 상대적으로 이탈 낮음'
    ],
    '모델링 시사점': [
        '클래스 불균형 처리 필요',
        '중요 특성으로 활용',
        '고객 불만 지표로 활용',
        '계약 갱신 유도 전략',
        '가격 정책 검토'
    ]
}

print("\n=== EDA 인사이트 정리 ===")
print(pd.DataFrame(insights).to_string(index=False))

다음 단계 준비

# Persist the label-encoded frame for the modelling article that follows.
df_encoded.to_csv('churn_preprocessed.csv', index=False)

# Checklist of the work remaining before and during modelling.
print("\n=== 다음 단계 (모델링) 준비 사항 ===")
prep_tasks = [
    "1. 특성 엔지니어링 (파생 변수 생성)",
    "2. 범주형 인코딩 (One-hot 또는 Label)",
    "3. 수치형 스케일링 (필요시)",
    "4. 클래스 불균형 처리",
    "5. 학습/검증 데이터 분할",
    "6. FLAML AutoML 적용",
]

# One indented line per task.
print("\n".join("  " + task for task in prep_tasks))

print("\n전처리 데이터 저장: churn_preprocessed.csv")
print("다음 글에서 FLAML을 사용한 모델링을 진행합니다.")

정리

데이터셋: 10,000명 고객의 이탈 예측 데이터
이탈률: 약 19% (클래스 불균형)
주요 특성: tenure, num_support_tickets, contract_type
EDA: 분포, 상관관계, 이상치 분석 완료
다음 글에서 FLAML로 모델링 진행

다음 글 예고

다음 글에서는 캐글 대회 도전 (2) 모델링을 진행합니다. FLAML AutoML을 활용하여 이탈 예측 모델을 구축합니다.

FLAML AutoML 마스터 시리즈 #081

개요​

실습 환경​

프로젝트 개요: 고객 이탈 예측​

기본 데이터 정보​

수치형 변수 탐색​

범주형 변수 탐색​

타겟과 변수 관계 분석​

범주형 변수와 이탈 관계​

상관관계 분석​

이상치 탐지​

특성 중요도 초기 확인​

EDA 인사이트 정리​

다음 단계 준비​

정리​

다음 글 예고​

개요