049 회귀 모델 해석

키워드: 모델 해석, SHAP, 부분 의존성

개요

회귀 모델의 예측을 이해하고 설명하는 것은 비즈니스 의사결정에 중요합니다. 이 글에서는 SHAP, 부분 의존성 플롯 등 다양한 해석 기법을 활용하여 회귀 모델을 분석합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], shap, scikit-learn, matplotlib

pip install "flaml[automl]" shap scikit-learn matplotlib

모델 준비

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from flaml import AutoML

# Load the California housing dataset and wrap the feature matrix in a
# DataFrame so every column carries its feature name.
data = fetch_california_housing()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = data.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# FLAML search settings, collected in one place so they are easy to tweak.
fit_settings = {
    "task": "regression",
    "time_budget": 60,
    "metric": "r2",
    "verbose": 0,
}
automl = AutoML()
automl.fit(X_train, y_train, **fit_settings)

print(f"최적 모델: {automl.best_estimator}")
test_score = automl.score(X_test, y_test)
print(f"R² Score: {test_score:.4f}")

1. 특성 중요도

전통적 특성 중요도

# Feature importances reported by the fitted (tree-based) model itself.
# FLAML's AutoML object has no `best_model` attribute: the fitted learner is
# exposed as `automl.model`, and the wrapped library estimator (the object that
# carries `feature_importances_`) is `automl.model.estimator`.
best_model = automl.model.estimator
if hasattr(best_model, 'feature_importances_'):
    # np.asarray guarantees that the fancy indexing below works.
    importance = np.asarray(best_model.feature_importances_)
    feature_names = X.columns

    # Feature indices ordered from most to least important.
    sorted_idx = np.argsort(importance)[::-1]

    plt.figure(figsize=(10, 6))
    plt.bar(range(len(importance)), importance[sorted_idx])
    plt.xticks(range(len(importance)), feature_names[sorted_idx], rotation=45, ha='right')
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.title('Feature Importance (Traditional)')
    plt.tight_layout()
    plt.show()

    # Text summary of the five highest-ranked features.
    print("특성 중요도 (상위 5개):")
    for i in sorted_idx[:5]:
        print(f"  {feature_names[i]}: {importance[i]:.4f}")

순열 중요도

from sklearn.inspection import permutation_importance

# Permutation importance on the held-out split, with 10 shuffles per feature
# and a fixed seed so the result is reproducible.
perm_importance = permutation_importance(
    automl, X_test, y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1
)

# Feature indices ordered from highest to lowest mean importance.
sorted_idx = perm_importance.importances_mean.argsort()[::-1]

plt.figure(figsize=(10, 6))
plt.boxplot([perm_importance.importances[i] for i in sorted_idx], vert=False)
# Label the boxes through the y ticks instead of boxplot's `labels=` keyword:
# Matplotlib 3.9 deprecated `labels=` in favour of `tick_labels=`, whereas
# setting the ticks directly behaves the same on old and new releases.
# Boxes are drawn at positions 1..N by default.
plt.yticks(range(1, len(sorted_idx) + 1), X.columns[sorted_idx])
plt.xlabel('Importance')
plt.title('Permutation Importance')
plt.tight_layout()
plt.show()

# Text summary of the five highest-ranked features (mean ± std over repeats).
print("\n순열 중요도 (상위 5개):")
for i in sorted_idx[:5]:
    print(f"  {X.columns[i]}: {perm_importance.importances_mean[i]:.4f} "
          f"(±{perm_importance.importances_std[i]:.4f})")

2. SHAP 분석

SHAP 값 계산

import shap

# TreeExplainer needs the raw tree model. FLAML's AutoML object has no
# `best_model` attribute; the fitted learner is `automl.model` and the wrapped
# library estimator is `automl.model.estimator`.
explainer = shap.TreeExplainer(automl.model.estimator)
shap_values = explainer.shap_values(X_test)

print("SHAP 값 shape:", shap_values.shape)
print("  → 각 샘플, 각 특성에 대한 기여도")

SHAP Summary Plot

# Summary plot of the SHAP values computed for the test set.
plt.figure(figsize=(10, 8))
# show=False defers rendering so that tight_layout() can run before plt.show().
shap.summary_plot(shap_values, X_test, show=False)
plt.tight_layout()
plt.show()

# Print a short guide on how to read the plot.
print("SHAP Summary Plot 해석:")
print("  - 점의 색: 특성값 (빨강=높음, 파랑=낮음)")
print("  - X축: SHAP 값 (예측에 대한 기여)")
print("  - Y축: 특성 (중요도 순)")

SHAP Bar Plot

# Bar-type summary plot (mean absolute SHAP value per feature).
plt.figure(figsize=(10, 6))
# show=False defers rendering so that tight_layout() can run before plt.show().
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.tight_layout()
plt.show()

개별 예측 해석

# Explain the prediction for one test-set row.
sample_idx = 0

# `y_test` is produced by train_test_split from `data.target`, a NumPy array,
# so it has no `.iloc`. np.asarray gives positional indexing that works for
# both arrays and pandas Series.
actual_value = np.asarray(y_test)[sample_idx]

print(f"\n샘플 {sample_idx} 예측 해석:")
print(f"  실제값: {actual_value:.4f}")
print(f"  예측값: {automl.predict(X_test.iloc[[sample_idx]])[0]:.4f}")

# Force plot for the selected row, rendered with matplotlib; show=False lets
# tight_layout() run before the figure is displayed.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx],
    X_test.iloc[sample_idx],
    matplotlib=True,
    show=False
)
plt.tight_layout()
plt.show()

# Per-feature contributions for the row, largest absolute SHAP value first.
print("\n특성별 기여도:")
contributions = pd.DataFrame({
    'Feature': X.columns,
    'Value': X_test.iloc[sample_idx].values,
    'SHAP': shap_values[sample_idx]
}).sort_values('SHAP', key=abs, ascending=False)
print(contributions.head(5).to_string(index=False))

SHAP Dependence Plot

# SHAP dependence plots for four features, one per cell of a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Hard-coded selection; it is not derived from the importance rankings above.
top_features = ['MedInc', 'AveOccup', 'Latitude', 'Longitude']

for ax, feature in zip(axes.flatten(), top_features):
    # interaction_index=None is passed so no interaction feature is selected;
    # show=False keeps the shared figure open until every panel is drawn.
    shap.dependence_plot(
        feature, shap_values, X_test,
        interaction_index=None,
        ax=ax, show=False
    )
    ax.set_title(f'SHAP Dependence: {feature}')

plt.tight_layout()
plt.show()

3. 부분 의존성 플롯 (PDP)

단일 특성 PDP

from sklearn.inspection import PartialDependenceDisplay

# Partial dependence plots for four individual features on a 2x2 grid.
# scikit-learn's partial-dependence tooling only accepts an estimator it
# recognises as a fitted regressor or classifier, so hand it the underlying
# model FLAML selected (`automl.model.estimator`) instead of the AutoML wrapper.
pdp_estimator = automl.model.estimator

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

features = [0, 1, 6, 7]  # MedInc, HouseAge, Latitude, Longitude
feature_names_list = [X.columns[i] for i in features]

for ax, feature, feature_name in zip(axes.flatten(), features, feature_names_list):
    PartialDependenceDisplay.from_estimator(
        pdp_estimator, X_test, [feature],
        ax=ax, grid_resolution=50
    )
    ax.set_title(f'PDP: {feature_name}')

plt.tight_layout()
plt.show()

# Print a short guide on how to read the plots.
print("PDP 해석:")
print("  - Y축: 평균 예측값")
print("  - X축: 특성값")
print("  - 곡선: 해당 특성이 예측에 미치는 평균적 영향")

2차원 PDP

# Two-feature partial dependence plot to inspect how a feature pair acts jointly.
# scikit-learn's partial-dependence tooling only accepts an estimator it
# recognises as a fitted regressor or classifier, so hand it the underlying
# model FLAML selected (`automl.model.estimator`) instead of the AutoML wrapper.
pdp_2d_estimator = automl.model.estimator

fig, ax = plt.subplots(figsize=(10, 8))

PartialDependenceDisplay.from_estimator(
    pdp_2d_estimator, X_test, [(0, 6)],  # column 0 = MedInc, column 6 = Latitude
    ax=ax, grid_resolution=30
)
ax.set_title('2D PDP: MedInc vs Latitude')

plt.tight_layout()
plt.show()

4. ICE 플롯 (Individual Conditional Expectation)

# ICE plots: per-sample curves drawn together with their average (the PDP).
# scikit-learn's partial-dependence tooling only accepts an estimator it
# recognises as a fitted regressor or classifier, so hand it the underlying
# model FLAML selected (`automl.model.estimator`) instead of the AutoML wrapper.
ice_estimator = automl.model.estimator

# Only the first 100 test rows are used for the ICE curves.
ice_sample = X_test.iloc[:100]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column index, title label) for each panel: column 0 = MedInc, 5 = AveOccup.
ice_panels = [(0, 'MedInc'), (5, 'AveOccup')]

for ax, (feature_idx, feature_label) in zip(axes, ice_panels):
    PartialDependenceDisplay.from_estimator(
        ice_estimator, ice_sample, [feature_idx],
        kind="both",  # draw the individual ICE lines and the averaged PDP line
        ax=ax,
        ice_lines_kw={"alpha": 0.3}
    )
    ax.set_title(f'ICE + PDP: {feature_label}')

plt.tight_layout()
plt.show()

# Print a short guide on how to read the plots.
print("ICE 플롯 해석:")
print("  - 얇은 선: 개별 샘플의 예측 변화")
print("  - 굵은 선: 평균 (PDP)")
print("  - 선들이 평행하면 상호작용 없음")

5. 종합 해석 보고서

def _resolve_tree_model(model):
    """Return the fitted estimator to hand to SHAP's TreeExplainer."""
    # Objects that expose `best_model` keep working exactly as before.
    explicit = getattr(model, "best_model", None)
    if explicit is not None:
        return explicit
    # FLAML's AutoML keeps the fitted learner in `.model`; that learner wraps
    # the library estimator in `.estimator`.
    learner = getattr(model, "model", None)
    if learner is None:
        # No wrapper layer found: treat `model` itself as the estimator.
        return model
    return getattr(learner, "estimator", learner)


def generate_interpretation_report(model, X_test, y_test, feature_names):
    """Print a text interpretation report for a fitted regression model.

    The report has five sections: test-set performance (R² and MAE), the top
    five features by mean absolute SHAP value, the sign of the correlation
    between each of those features and its SHAP values, summary statistics of
    the predictions, and two headline insights.

    Args:
        model: Fitted model with a ``predict`` method. A FLAML ``AutoML``
            instance, an object exposing ``best_model``, or a plain tree
            estimator are all accepted; the tree model is resolved by
            ``_resolve_tree_model``.
        X_test: Feature DataFrame (columns are accessed positionally via ``.iloc``).
        y_test: True target values aligned with ``X_test``.
        feature_names: Index-able sequence of feature names in column order.

    Returns:
        None. Everything is written to standard output.
    """

    print("=" * 60)
    print("회귀 모델 해석 보고서")
    print("=" * 60)

    # 1. Model performance on the supplied test data.
    from sklearn.metrics import r2_score, mean_absolute_error
    y_pred = model.predict(X_test)

    print("\n1. 모델 성능")
    print(f"   R² Score: {r2_score(y_test, y_pred):.4f}")
    print(f"   MAE: {mean_absolute_error(y_test, y_pred):.4f}")

    # 2. SHAP-based feature importance. FLAML's AutoML has no `best_model`
    #    attribute, so the underlying estimator is resolved explicitly.
    explainer = shap.TreeExplainer(_resolve_tree_model(model))
    shap_values = explainer.shap_values(X_test)

    mean_shap = np.abs(shap_values).mean(axis=0)
    sorted_idx = np.argsort(mean_shap)[::-1]

    print("\n2. 특성 중요도 (SHAP 기반)")
    for i, idx in enumerate(sorted_idx[:5], 1):
        print(f"   {i}. {feature_names[idx]}: {mean_shap[idx]:.4f}")

    # 3. Direction of influence: sign of the correlation between a feature's
    #    values and its SHAP values.
    #    NOTE(review): a NaN correlation (e.g. a constant column) falls into
    #    the "negative" branch because `nan > 0` is False.
    print("\n3. 특성 영향 방향")
    for idx in sorted_idx[:5]:
        corr = np.corrcoef(X_test.iloc[:, idx], shap_values[:, idx])[0, 1]
        direction = "양의 상관" if corr > 0 else "음의 상관"
        print(f"   {feature_names[idx]}: {direction} ({corr:.3f})")

    # 4. Summary statistics of the predictions.
    print("\n4. 예측 범위")
    print(f"   최소: {y_pred.min():.4f}")
    print(f"   최대: {y_pred.max():.4f}")
    print(f"   평균: {y_pred.mean():.4f}")
    print(f"   표준편차: {y_pred.std():.4f}")

    # 5. Headline insights: the top feature and the share of total mean |SHAP|
    #    carried by the top three features.
    print("\n5. 핵심 인사이트")
    print(f"   - {feature_names[sorted_idx[0]]}이(가) 가장 영향력 있는 특성")
    print(f"   - 상위 3개 특성이 전체 영향의 "
          f"{mean_shap[sorted_idx[:3]].sum()/mean_shap.sum()*100:.1f}% 차지")

    print("=" * 60)

# Generate the report for the trained AutoML model on the held-out split,
# using the DataFrame's column labels as feature names.
generate_interpretation_report(automl, X_test, y_test, X.columns)

해석 방법 선택 가이드

# Each row pairs an interpretation goal with a suggested method and its main
# strength; the rows are transposed into the column-oriented dict below.
guide_rows = [
    ('전역 특성 중요도', 'SHAP Summary', '전체 그림 파악'),
    ('개별 예측 설명', 'SHAP Force Plot', '개별 예측 디버깅'),
    ('특성 효과 시각화', 'PDP', '직관적인 해석'),
    ('상호작용 탐지', '2D PDP / SHAP 의존성', '복잡한 관계 발견'),
]
guide_columns = ('목적', '방법', '장점')
guide = {
    column: list(values)
    for column, values in zip(guide_columns, zip(*guide_rows))
}

print("\n해석 방법 선택 가이드:")
guide_table = pd.DataFrame(guide)
print(guide_table.to_string(index=False))

정리

특성 중요도: 전통적, 순열, SHAP 기반 방법
SHAP: 이론적으로 일관된 해석, 개별 예측 설명 가능
PDP: 특성이 예측에 미치는 평균적 효과
ICE: 개별 샘플별 효과, 이질성 탐지
해석은 비즈니스 의사결정에 중요
여러 방법을 조합하여 종합적으로 이해

다음 글 예고

다음 글에서는 회귀 파트 총정리로 지금까지 배운 내용을 정리하겠습니다.

FLAML AutoML 마스터 시리즈 #049

개요​

실습 환경​

모델 준비​

1. 특성 중요도​

전통적 특성 중요도​

순열 중요도​

2. SHAP 분석​

SHAP 값 계산​

SHAP Summary Plot​

SHAP Bar Plot​

개별 예측 해석​

SHAP Dependence Plot​

3. 부분 의존성 플롯 (PDP)​

단일 특성 PDP​

2차원 PDP​

4. ICE 플롯 (Individual Conditional Expectation)​

5. 종합 해석 보고서​

해석 방법 선택 가이드​

정리​

다음 글 예고​

개요

실습 환경

모델 준비

1. 특성 중요도

전통적 특성 중요도

순열 중요도

2. SHAP 분석

SHAP 값 계산

SHAP Summary Plot

SHAP Bar Plot

개별 예측 해석

SHAP Dependence Plot

3. 부분 의존성 플롯 (PDP)

단일 특성 PDP

2차원 PDP

4. ICE 플롯 (Individual Conditional Expectation)

5. 종합 해석 보고서

해석 방법 선택 가이드

정리

다음 글 예고