088 불균형 데이터 처리

키워드: 불균형 데이터, imbalanced

개요

불균형 데이터(Imbalanced Data)는 클래스 간 샘플 수 차이가 큰 데이터입니다. 사기 탐지, 질병 진단 등 실제 문제에서 흔히 발생하며, 적절한 처리 없이는 모델 성능이 저하됩니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0, imbalanced-learn

불균형 데이터의 문제

불균형 예시:
- 사기 거래: 0.1%
- 희귀 질병: 1%
- 제조 불량: 2%

문제점:
- 다수 클래스에 편향된 학습
- 소수 클래스 예측 성능 저하
- 정확도 높아도 실용성 없음

예: 사기 0.1%
- 모든 걸 정상으로 예측해도 99.9% 정확도
- 하지만 사기 탐지 목적 달성 실패

PyCaret에서 불균형 처리

from pycaret.classification import *
import pandas as pd
import numpy as np

# Build a synthetic imbalanced dataset: 950 majority rows vs. 50 minority rows.
np.random.seed(42)
n_majority = 950
n_minority = 50

# Mean shift added to each minority-class feature; the majority class is
# drawn from the unshifted standard normal distribution.
_minority_shift = {'feature1': 2, 'feature2': 1, 'feature3': 1.5}

# Draw order matters for reproducibility under the fixed seed: all majority
# features first (feature1 -> feature3), then the minority features in the
# same order.
majority = pd.DataFrame(
    {name: np.random.randn(n_majority) for name in _minority_shift}
).assign(target=0)

minority = pd.DataFrame(
    {name: np.random.randn(n_minority) + shift
     for name, shift in _minority_shift.items()}
).assign(target=1)

# Stack both classes into one frame with a fresh 0..N-1 index.
data = pd.concat([majority, minority], ignore_index=True)

print(f"클래스 분포:\n{data['target'].value_counts()}")
print(f"불균형 비율: 1:{n_majority//n_minority}")

fix_imbalance 옵션

from pycaret.classification import *

# Settings shared by both experiments, so that fix_imbalance is the only
# difference between the two runs.
_base_setup = dict(target='target', session_id=42, verbose=False)

# Baseline: random forest trained without any imbalance handling.
clf_no_fix = setup(data, **_base_setup)
rf_no_fix = create_model('rf')

# Same experiment with imbalance handling switched on (SMOTE, per this
# tutorial).
clf_fix = setup(data, fix_imbalance=True, **_base_setup)
rf_fix = create_model('rf')

오버샘플링 (Oversampling)

SMOTE

from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd
import numpy as np

# Separate the label column from the feature columns.
y = data['target']
X = data.drop(columns='target')

print(f"원본 클래스 분포: {Counter(y)}")

# Oversample with SMOTE; the fixed random_state keeps the result reproducible.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled

print(f"SMOTE 후 분포: {Counter(y_resampled)}")

SMOTE 변형

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN

# Three oversamplers to compare, all seeded identically:
#   SMOTE           - the baseline algorithm
#   BorderlineSMOTE - concentrates on samples near the class boundary
#   ADASYN          - concentrates on samples that are hard to learn
smote = SMOTE(random_state=42)
borderline = BorderlineSMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

samplers = {'SMOTE': smote, 'Borderline': borderline, 'ADASYN': adasyn}

# Resample with each sampler and report the resulting class counts. A sampler
# that fails is reported and does not stop the comparison.
for name, sampler in samplers.items():
    try:
        X_res, y_res = sampler.fit_resample(X, y)
        print(f"{name}: {Counter(y_res)}")
    except Exception as e:
        print(f"{name} 오류: {e}")

언더샘플링 (Undersampling)

from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss

# Instantiate the three undersamplers up front.
rus = RandomUnderSampler(random_state=42)  # random undersampling
tomek = TomekLinks()                       # removes majority samples on the class boundary
nearmiss = NearMiss(version=1)             # NearMiss, variant 1

# Random undersampling.
X_rus, y_rus = rus.fit_resample(X, y)
print(f"랜덤 언더샘플링: {Counter(y_rus)}")

# Tomek Links.
X_tomek, y_tomek = tomek.fit_resample(X, y)
print(f"Tomek Links: {Counter(y_tomek)}")

# NearMiss.
X_nm, y_nm = nearmiss.fit_resample(X, y)
print(f"NearMiss: {Counter(y_nm)}")

복합 방법

from imblearn.combine import SMOTEENN, SMOTETomek

# Hybrid samplers: oversample with SMOTE, then apply a cleaning step
# (ENN or Tomek Links).
smote_enn = SMOTEENN(random_state=42)
smote_tomek = SMOTETomek(random_state=42)

# SMOTE followed by ENN cleaning.
X_se, y_se = smote_enn.fit_resample(X, y)
print(f"SMOTE+ENN: {Counter(y_se)}")

# SMOTE followed by Tomek Links removal.
X_st, y_st = smote_tomek.fit_resample(X, y)
print(f"SMOTE+Tomek: {Counter(y_st)}")

클래스 가중치 (Class Weight)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def _f1_cv(model):
    """Return the per-fold F1 scores of ``model`` from 5-fold CV on (X, y)."""
    return cross_val_score(model, X, y, cv=5, scoring='f1')


# Baseline: no class weighting.
rf_no_weight = RandomForestClassifier(random_state=42)
scores_no = _f1_cv(rf_no_weight)
print(f"가중치 없음 - F1: {scores_no.mean():.4f}")

# 'balanced' class weighting.
rf_balanced = RandomForestClassifier(class_weight='balanced', random_state=42)
scores_balanced = _f1_cv(rf_balanced)
print(f"balanced 가중치 - F1: {scores_balanced.mean():.4f}")

# Hand-picked weights: class 1 counts ten times as much as class 0.
rf_custom = RandomForestClassifier(class_weight={0: 1, 1: 10}, random_state=42)
scores_custom = _f1_cv(rf_custom)
print(f"커스텀 가중치 - F1: {scores_custom.mean():.4f}")

평가 지표 선택

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            classification_report)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits; without it, a random split of data this imbalanced can
# leave the test set with very few (or no) minority samples, which makes the
# recall / precision / AUC numbers below unstable or undefined.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train a class-weighted random forest and predict on the held-out rows.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # second probability column (label 1 for the 0/1 target)

# Full per-class report.
print("=== 분류 보고서 ===")
print(classification_report(y_test, y_pred))

# Individual metrics; AUC-ROC is computed from probabilities, the others from
# hard predictions.
print("\n=== 개별 지표 ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1: {f1_score(y_test, y_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")

print("\n=== 혼동 행렬 ===")
print(confusion_matrix(y_test, y_pred))

지표 선택 가이드

상황별 권장 지표:

사기 탐지 (FN 비용 높음):
- Recall 중요
- F2 Score (Recall 가중)

스팸 필터 (FP 비용 높음):
- Precision 중요
- F0.5 Score (Precision 가중)

균형 잡힌 평가:
- F1 Score
- AUC-ROC

정확도(Accuracy)만으로는 불균형 데이터에서 모델 성능을 제대로 판단할 수 없습니다!

임계값 조정

import numpy as np
# precision_score and recall_score are used in the loop below, so they are
# imported here to keep this snippet runnable on its own.
from sklearn.metrics import (precision_recall_curve, f1_score,
                             precision_score, recall_score)
import matplotlib.pyplot as plt

# Second probability column (label 1 for the 0/1 target) on the test set.
y_prob = rf.predict_proba(X_test)[:, 1]

# Evaluate F1 / precision / recall at each candidate decision threshold.
thresholds = np.arange(0.1, 0.9, 0.05)
f1_scores = []
precisions = []
recalls = []

for thresh in thresholds:
    # A sample is predicted positive when its probability reaches the threshold.
    y_pred_thresh = (y_prob >= thresh).astype(int)
    # zero_division=0: at high thresholds no sample may be predicted positive.
    # Score that case as 0 explicitly instead of emitting an undefined-metric
    # warning on every such iteration.
    f1_scores.append(f1_score(y_test, y_pred_thresh, zero_division=0))
    precisions.append(precision_score(y_test, y_pred_thresh, zero_division=0))
    recalls.append(recall_score(y_test, y_pred_thresh, zero_division=0))

# The best threshold is the one with the highest F1 (first one on ties).
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"최적 임계값: {best_threshold:.2f}")
print(f"최고 F1: {f1_scores[best_idx]:.4f}")

# Plot all three curves against the threshold and mark the best one.
plt.figure(figsize=(10, 6))
plt.plot(thresholds, f1_scores, 'b-', label='F1')
plt.plot(thresholds, precisions, 'g--', label='Precision')
plt.plot(thresholds, recalls, 'r--', label='Recall')
plt.axvline(x=best_threshold, color='black', linestyle=':', label=f'Best: {best_threshold:.2f}')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Threshold Optimization')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('threshold_optimization.png', dpi=150)

PyCaret 종합 예제

from pycaret.classification import *
import pandas as pd

# Compare the same random forest with and without imbalance handling.
methods = {
    'No Fix': {'fix_imbalance': False},
    'SMOTE': {'fix_imbalance': True}
}

results = []

for name, params in methods.items():
    clf = setup(data, target='target', **params, session_id=42, verbose=False)
    rf = create_model('rf')

    # pull() returns the score grid of the most recent training call; its
    # 'Mean' row holds the cross-validated average of each metric. Recording
    # it here is what makes the two methods comparable afterwards.
    mean_scores = pull().loc['Mean']
    results.append({
        'Method': name,
        'Model': 'Random Forest',
        **mean_scores.to_dict()
    })

# One row per method, one column per metric.
print(pd.DataFrame(results))

# Compare many model types with imbalance handling enabled, ranked by F1.
clf = setup(data, target='target', fix_imbalance=True, session_id=42, verbose=False)
best = compare_models(sort='F1')  # sorted by F1

방법 선택 가이드

데이터 크기:
- 대용량: 언더샘플링
- 소량: 오버샘플링 (SMOTE)

소수 클래스 크기:
- 매우 적음: ADASYN, Borderline SMOTE
- 어느 정도 있음: 기본 SMOTE

모델 유형:
- 트리 기반: class_weight도 효과적
- SVM/신경망: 샘플링 권장

실시간 예측:
- 샘플링은 학습 데이터에만 적용
- 검증·테스트·예측 데이터는 원본 분포 그대로 사용

정리

불균형 데이터: 클래스 비율 차이가 큰 데이터
SMOTE: 소수 클래스 합성 (오버샘플링)
언더샘플링: 다수 클래스 축소
class_weight: 학습 시 가중치 부여
F1, AUC-ROC 등 적절한 지표 사용
임계값 조정으로 추가 최적화

다음 글 예고

다음 글에서는 파이프라인과 전처리를 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #088

개요​

실습 환경​

불균형 데이터의 문제​

PyCaret에서 불균형 처리​

fix_imbalance 옵션​

오버샘플링 (Oversampling)​

SMOTE​

SMOTE 변형​

언더샘플링 (Undersampling)​

복합 방법​

클래스 가중치 (Class Weight)​

평가 지표 선택​

지표 선택 가이드​

임계값 조정​

PyCaret 종합 예제​

방법 선택 가이드​

정리​

다음 글 예고​

개요

실습 환경

불균형 데이터의 문제

PyCaret에서 불균형 처리

fix_imbalance 옵션

오버샘플링 (Oversampling)

SMOTE

SMOTE 변형

언더샘플링 (Undersampling)

복합 방법

클래스 가중치 (Class Weight)

평가 지표 선택

지표 선택 가이드

임계값 조정

PyCaret 종합 예제

방법 선택 가이드

정리

다음 글 예고