021 ROC 곡선과 AUC 시각화

키워드: ROC, AUC, 시각화

개요

ROC 곡선은 분류 모델의 성능을 시각적으로 평가하는 강력한 도구입니다. 이 글에서는 ROC 곡선을 그리고, 해석하고, 여러 모델을 비교하는 방법을 상세히 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], scikit-learn, matplotlib

pip install "flaml[automl]" scikit-learn matplotlib

ROC 곡선 복습

ROC 곡선이란?

ROC(Receiver Operating Characteristic) 곡선은 **임계값(threshold)**을 변화시키면서 TPR과 FPR의 관계를 그린 곡선입니다.

TPR (True Positive Rate) = TP / (TP + FN) = Recall
FPR (False Positive Rate) = FP / (FP + TN) = 1 - Specificity

임계값의 역할

import numpy as np

# Ground-truth labels and the predicted probabilities for the same six samples
y_true = np.array([0, 0, 1, 1, 1, 1])
y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.9])

# Binarize the same probabilities at several cut-offs to show how the
# predicted labels depend on the chosen threshold
for threshold in (0.3, 0.5, 0.7):
    is_positive = y_prob >= threshold
    y_pred = is_positive.astype(int)
    print(f"임계값 {threshold}: 예측 = {y_pred}")

실행 결과

임계값 0.3: 예측 = [0 1 1 1 1 1]
임계값 0.5: 예측 = [0 0 0 1 1 1]
임계값 0.7: 예측 = [0 0 0 1 1 1]

ROC 곡선 그리기

기본 ROC 곡선

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from flaml import AutoML
import matplotlib.pyplot as plt

# Load the breast-cancer dataset and hold out 20% of it as the test split
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Let FLAML search for a classifier within a 30-second budget
automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    time_budget=30,
    verbose=0,
)

# Keep only column 1 of predict_proba as the score handed to roc_curve
y_prob = automl.predict_proba(X_test)[:, 1]

# ROC points and the area under them
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot the model's ROC curve against the diagonal of a random classifier
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='blue', lw=2,
        label=f'ROC Curve (AUC = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--',
        label='Random Classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (FPR)', fontsize=12)
ax.set_ylabel('True Positive Rate (TPR)', fontsize=12)
ax.set_title('ROC Curve', fontsize=14)
ax.legend(loc='lower right', fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('roc_curve_basic.png', dpi=100)
plt.show()

임계값 표시 추가

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, 'b-', lw=2, label=f'ROC (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')

# Mark the ROC points closest to a few reference thresholds.
# roc_curve only returns thresholds that occur among the predicted scores, so
# an exact equality test against 0.3 / 0.5 / 0.7 / 0.9 would almost never
# match and nothing would be drawn. Instead, pick the nearest available
# threshold for each target and label the point with its actual value.
marked_indices = set()
for target in (0.3, 0.5, 0.7, 0.9):
    nearest = int(np.argmin(np.abs(thresholds - target)))
    if nearest in marked_indices:
        # Several targets can map to the same ROC point; draw it only once.
        continue
    marked_indices.add(nearest)
    plt.scatter(fpr[nearest], tpr[nearest], s=100, zorder=5)
    plt.annotate(f't={thresholds[nearest]:.2f}',
                 (fpr[nearest], tpr[nearest]),
                 textcoords="offset points", xytext=(10, -10))

plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve with Thresholds')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

여러 모델 ROC 비교

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Models to compare; the FLAML entry reuses the already-fitted `automl`
models = {
    'FLAML AutoML': automl,
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
}

# Fit only the scikit-learn models; the FLAML model was trained earlier
for name, model in models.items():
    if name != 'FLAML AutoML':
        model.fit(X_train, y_train)

# Draw every model's ROC curve on one figure
plt.figure(figsize=(10, 8))
colors = ['blue', 'green', 'red', 'purple']

for (name, model), color in zip(models.items(), colors):
    # Loop-local names are used on purpose: reusing y_prob / fpr / tpr /
    # roc_auc here would overwrite the FLAML results stored under those names
    # and leave them out of sync with the FLAML `thresholds` array.
    if hasattr(model, 'predict_proba'):
        model_scores = model.predict_proba(X_test)[:, 1]
    else:
        model_scores = model.decision_function(X_test)

    model_fpr, model_tpr, _ = roc_curve(y_test, model_scores)
    model_auc = auc(model_fpr, model_tpr)

    plt.plot(model_fpr, model_tpr, color=color, lw=2,
             label=f'{name} (AUC = {model_auc:.4f})')

plt.plot([0, 1], [0, 1], 'k--', lw=1, label='Random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve Comparison', fontsize=14)
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('roc_comparison.png', dpi=100)
plt.show()

AUC 계산 방법

sklearn 사용

from sklearn.metrics import roc_auc_score

# Approach 1: get the AUC in a single call.
# y_prob is whichever score vector was most recently assigned above.
auc_score = roc_auc_score(y_true=y_test, y_score=y_prob)
print(f"AUC (roc_auc_score): {auc_score:.4f}")

# Approach 2: compute the ROC points first, then integrate them with auc()
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_prob)
auc_score2 = auc(x=fpr, y=tpr)
print(f"AUC (auc 함수): {auc_score2:.4f}")

다중 분류 AUC

from sklearn.datasets import load_iris
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# Three-class iris data.
# Dedicated *_multi names are used so the binary breast-cancer split
# (X_train / X_test / y_train / y_test) that `automl` was trained on is not
# overwritten; later binary sections still read those names.
X_multi, y_multi = load_iris(return_X_y=True)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Train a separate FLAML model for the multi-class task
automl_multi = AutoML()
automl_multi.fit(X_train_multi, y_train_multi,
                 task="classification", time_budget=30, verbose=0)

# Full probability matrix (one column per class), as required by
# roc_auc_score when multi_class is set
y_prob_multi = automl_multi.predict_proba(X_test_multi)

# Multi-class AUC, One-vs-Rest (OvR)
auc_ovr = roc_auc_score(y_test_multi, y_prob_multi, multi_class='ovr')
print(f"AUC (OvR): {auc_ovr:.4f}")

# Multi-class AUC, One-vs-One (OvO)
auc_ovo = roc_auc_score(y_test_multi, y_prob_multi, multi_class='ovo')
print(f"AUC (OvO): {auc_ovo:.4f}")

최적 임계값 찾기

Youden's J statistic

# Youden's J = TPR - FPR; the threshold that maximizes it is taken as optimal.
#
# fpr, tpr and thresholds must come from the SAME roc_curve call, otherwise
# the argmax index points at an unrelated threshold (or is out of range).
# The globals of those names are reassigned by several sections of this
# tutorial, so rebuild the binary test split (same size and seed as the
# original split) and recompute the FLAML ROC arrays here.
X_bin, y_bin = load_breast_cancer(return_X_y=True)
_, X_test_bin, _, y_test_bin = train_test_split(
    X_bin, y_bin, test_size=0.2, random_state=42
)
y_prob_bin = automl.predict_proba(X_test_bin)[:, 1]
fpr_bin, tpr_bin, thresholds_bin = roc_curve(y_test_bin, y_prob_bin)
roc_auc_bin = auc(fpr_bin, tpr_bin)

j_scores = tpr_bin - fpr_bin
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds_bin[optimal_idx]

print(f"최적 임계값 (Youden's J): {optimal_threshold:.4f}")
print(f"해당 TPR: {tpr_bin[optimal_idx]:.4f}")
print(f"해당 FPR: {fpr_bin[optimal_idx]:.4f}")

# Highlight the optimal operating point on the ROC curve
plt.figure(figsize=(10, 8))
plt.plot(fpr_bin, tpr_bin, 'b-', lw=2, label=f'ROC (AUC = {roc_auc_bin:.4f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.scatter(fpr_bin[optimal_idx], tpr_bin[optimal_idx],
            marker='o', color='red', s=200, zorder=5,
            label=f'Optimal (threshold={optimal_threshold:.3f})')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve with Optimal Threshold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

ROC 곡선 함수화

def plot_roc_curves(models_dict, X_test, y_test, figsize=(10, 8)):
    """
    Draw the ROC curves of several fitted models on one new figure.

    Parameters:
    - models_dict: {'model name': model object} dictionary. A model is scored
      with predict_proba (column 1) when it has that attribute, otherwise
      with decision_function; a model offering neither is skipped.
    - X_test, y_test: evaluation data passed to the models and to roc_curve
    - figsize: size handed to plt.figure

    Returns:
    - dict keyed by model name; each value holds 'auc', 'fpr', 'tpr' and
      'thresholds' for that model. Skipped models have no entry.

    The function does not call plt.show(); the caller decides when to
    display or save the figure.
    """
    def _scores_for(estimator):
        # Prefer probabilities, fall back to decision scores, else give up.
        if hasattr(estimator, 'predict_proba'):
            return estimator.predict_proba(X_test)[:, 1]
        if hasattr(estimator, 'decision_function'):
            return estimator.decision_function(X_test)
        return None

    plt.figure(figsize=figsize)
    results = {}

    for name, model in models_dict.items():
        scores = _scores_for(model)
        if scores is None:
            continue

        curve_fpr, curve_tpr, curve_thresholds = roc_curve(y_test, scores)
        curve_auc = auc(curve_fpr, curve_tpr)
        plt.plot(curve_fpr, curve_tpr, lw=2,
                 label=f'{name} (AUC={curve_auc:.4f})')
        results[name] = dict(
            auc=curve_auc,
            fpr=curve_fpr,
            tpr=curve_tpr,
            thresholds=curve_thresholds,
        )

    # Diagonal reference line of a random classifier, then axis decoration
    plt.plot([0, 1], [0, 1], 'k--', lw=1, label='Random')
    axes = plt.gca()
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('ROC Curve Comparison')
    axes.legend(loc='lower right')
    axes.grid(True, alpha=0.3)
    plt.tight_layout()

    return results

# Usage example.
# `automl` was trained on the breast-cancer data, so it must be evaluated on
# that dataset's test split. Rebuild the split explicitly (same size and seed
# as the training-time split) instead of relying on the global X_test /
# y_test, which other sections of this tutorial may rebind to different data
# (e.g. the 3-class iris example).
X_bc, y_bc = load_breast_cancer(return_X_y=True)
_, X_test_bc, _, y_test_bc = train_test_split(
    X_bc, y_bc, test_size=0.2, random_state=42
)
results = plot_roc_curves({'FLAML': automl}, X_test_bc, y_test_bc)
plt.show()

정리

ROC 곡선은 임계값 변화에 따른 TPR과 FPR의 관계를 보여줍니다.
AUC는 ROC 곡선 아래 면적으로, 1에 가까울수록 좋습니다.
여러 모델의 ROC 곡선을 비교하여 최적 모델을 선택할 수 있습니다.
최적 임계값은 Youden's J statistic으로 찾을 수 있습니다.
다중 분류에서는 OvR 또는 OvO 방식으로 AUC를 계산합니다.

다음 글 예고

다음 글에서는 다중 분류 - 붓꽃 품종 예측에 대해 알아보겠습니다. 3개 이상의 클래스를 분류하는 다중 분류 프로젝트를 진행합니다.

FLAML AutoML 마스터 시리즈 #021

개요​

실습 환경​

ROC 곡선 복습​

ROC 곡선이란?​

임계값의 역할​

실행 결과​

ROC 곡선 그리기​

기본 ROC 곡선​

임계값 표시 추가​

여러 모델 ROC 비교​

AUC 계산 방법​

sklearn 사용​

다중 분류 AUC​

최적 임계값 찾기​

Youden's J statistic​

ROC 곡선 함수화​

정리​

다음 글 예고​

개요