085 모델 해석 - Feature Importance

키워드: Feature Importance, 특성 중요도

개요

Feature Importance(특성 중요도)는 모델 예측에 각 특성이 얼마나 기여하는지를 나타내는 지표입니다. 모델 해석, 특성 선택, 도메인 이해에 필수적인 기법입니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

특성 중요도의 종류

1. 모델 내장 중요도 (Built-in)
   - 트리 모델의 불순도 감소량
   - 선형 모델의 계수

2. 순열 중요도 (Permutation)
   - 특성 값을 섞었을 때 성능 감소량
   - 모델에 구애받지 않음

3. SHAP 중요도
   - 평균 절대 SHAP 값
   - 이론적 기반

4. Drop-Column 중요도
   - 특성 제거 후 성능 변화
   - 계산 비용 높음

PyCaret에서 특성 중요도

from pycaret.classification import *
from pycaret.datasets import get_data

# Quick start: train a random forest with PyCaret and plot its feature importance.

# Load the 'diabetes' sample dataset bundled with PyCaret.
data = get_data('diabetes')

# Initialise the experiment; 'Class variable' is the label column and
# session_id pins the random seed so repeated runs match.
clf = setup(data, target='Class variable', session_id=42, verbose=False)

# Train the model registered under the 'rf' id (random forest).
rf = create_model('rf')

# Draw the feature-importance plot for the trained model.
plot_model(rf, plot='feature')

트리 기반 모델 중요도

from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
from pycaret.datasets import get_data

# Load the diabetes data and separate the predictors from the label column.
data = get_data('diabetes')
y = data['Class variable']
X = data.drop(columns='Class variable')

# Fit a 100-tree random forest on the full dataset (seeded for reproducibility).
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X, y)

# Impurity-based importances, one per column, ranked from highest to lowest.
importance_df = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
)
importance_df = importance_df.sort_values('importance', ascending=False)

print("불순도 기반 특성 중요도:")
print(importance_df.round(4))

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature
# is drawn at the top of the figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importance')
ax.invert_yaxis()
fig.savefig('rf_importance.png', dpi=150)

순열 중요도 (Permutation Importance)

from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

# Hold out 20% of the rows so importance is measured on data the model has not seen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the forest on the training portion only.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Shuffle each feature 30 times on the hold-out set and record how much
# the model's score drops; n_jobs=-1 uses every available core.
perm_importance = permutation_importance(
    rf,
    X_test,
    y_test,
    n_repeats=30,
    random_state=42,
    n_jobs=-1,
)

# Mean and spread of the score drop per feature, ranked by the mean.
perm_df = pd.DataFrame(
    {
        'feature': X.columns,
        'importance_mean': perm_importance.importances_mean,
        'importance_std': perm_importance.importances_std,
    }
)
perm_df = perm_df.sort_values('importance_mean', ascending=False)

print("순열 중요도:")
print(perm_df.round(4))

# Bar chart with error bars showing the standard deviation across repeats.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    perm_df['feature'],
    perm_df['importance_mean'],
    xerr=perm_df['importance_std'],
    capsize=5,
)
ax.set_xlabel('Importance (accuracy decrease)')
ax.set_title('Permutation Feature Importance')
ax.invert_yaxis()
fig.savefig('permutation_importance.png', dpi=150)

선형 모델 계수

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

# Standardise the features first: coefficient magnitudes are only
# comparable across columns when the inputs share a scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Logistic regression on the scaled features.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_scaled, y)

# Keep the signed coefficient (direction of the effect) next to its
# absolute value, which serves as the importance score used for ranking.
class_coefs = lr.coef_[0]
coef_df = pd.DataFrame({'feature': X.columns, 'coefficient': class_coefs})
coef_df['abs_coefficient'] = coef_df['coefficient'].abs()
coef_df = coef_df.sort_values('abs_coefficient', ascending=False)

print("선형 모델 계수:")
print(coef_df.round(4))

# Signed bar chart: red bars are negative coefficients, blue bars are
# non-negative ones; the dashed line marks zero.
colors = coef_df['coefficient'].map(
    lambda value: 'red' if value < 0 else 'blue'
).tolist()
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(coef_df['feature'], coef_df['coefficient'], color=colors)
ax.axvline(x=0, color='black', linestyle='--')
ax.set_xlabel('Coefficient')
ax.set_title('Logistic Regression Coefficients')
ax.invert_yaxis()
fig.savefig('lr_coefficients.png', dpi=150)

여러 방법 비교

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import shap

# Imported here so this snippet does not depend on an earlier snippet
# having already imported it.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; X and y come from the earlier data-preparation step.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Impurity-based importance from a random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_importance = rf.feature_importances_

# 2. Permutation importance, measured on the hold-out set
perm = permutation_importance(rf, X_test, y_test, n_repeats=30, random_state=42)
perm_importance = perm.importances_mean

# 3. SHAP importance: mean absolute SHAP value per feature for class 1
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
if isinstance(shap_values, list):
    # Older SHAP releases return one (n_samples, n_features) array per class.
    positive_class_shap = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    # Newer SHAP releases (0.45+) return a single
    # (n_samples, n_features, n_classes) array; indexing it with [1] would
    # select the second *sample*, not the positive class.
    positive_class_shap = shap_values[:, :, 1]
else:
    # Already a single (n_samples, n_features) array.
    positive_class_shap = shap_values
shap_importance = abs(positive_class_shap).mean(axis=0)

def normalize(arr):
    """Min-max scale ``arr`` so importance scores from different methods are comparable.

    The smallest value maps to 0 and the largest to just under 1. A small
    constant (1e-10) is added to the range so an array whose values are all
    equal yields zeros instead of a division by zero.
    """
    lowest = arr.min()
    span = arr.max() - lowest + 1e-10
    return (arr - lowest) / span

from scipy.stats import spearmanr

# Put the three importance scores on a shared, roughly 0-1 scale so they
# can be read side by side.
comparison = pd.DataFrame({
    'feature': X.columns,
    'RF_Impurity': normalize(rf_importance),
    'Permutation': normalize(perm_importance),
    'SHAP': normalize(shap_importance)
})

# Show the table: without this the normalised scores are computed but
# never displayed.
print("정규화된 특성 중요도 비교:")
print(comparison.round(4))

# Spearman rank correlation between each pair of methods. It depends only
# on the ordering of the features, so the raw (un-normalised) scores are used.
print("\n중요도 순위 상관관계:")
print(f"RF vs Permutation: {spearmanr(rf_importance, perm_importance)[0]:.3f}")
print(f"RF vs SHAP: {spearmanr(rf_importance, shap_importance)[0]:.3f}")
print(f"Permutation vs SHAP: {spearmanr(perm_importance, shap_importance)[0]:.3f}")

상관된 특성 문제

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Synthetic data in which two of the three features are strongly correlated.
np.random.seed(42)
n_samples = 1000

feature1 = np.random.randn(n_samples)
noise = np.random.randn(n_samples) * 0.1
feature2 = feature1 + noise  # feature1 plus small noise, so nearly a duplicate
feature3 = np.random.randn(n_samples)
# The label depends only on feature1 and feature3.
target = ((feature1 + feature3) > 0).astype(int)

X_corr = pd.DataFrame(
    {'feature1': feature1, 'feature2': feature2, 'feature3': feature3}
)

# Random forest fitted on all three features.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_corr, target)

print("상관된 특성의 중요도:")
corr_importance = pd.DataFrame(
    {'feature': X_corr.columns, 'importance': rf.feature_importances_}
)
print(corr_importance)

# Caution: feature1 and feature2 share the importance between them, so
# feature1 - which actually drives the target - can appear less important
# than it really is.

특성 중요도 기반 선택

from pycaret.classification import *
import pandas as pd

# Initialise the experiment on the diabetes data loaded earlier.
clf = setup(data, target='Class variable', session_id=42, verbose=False)

rf = create_model('rf')

# Fetch the preprocessed training matrix once through get_config and reuse
# it. Its column names are assumed to line up with the fitted model's
# feature_importances_.
X_transformed = get_config('X_train_transformed')
feature_names = X_transformed.columns.tolist()

# Built-in (impurity-based) importance of the trained forest.
importance = rf.feature_importances_

# Keep the names of the top_n highest-scoring features.
top_n = 5
top_features = (
    pd.DataFrame({'feature': feature_names, 'importance': importance})
    .nlargest(top_n, 'importance')['feature']
    .tolist()
)

print(f"상위 {top_n}개 특성: {top_features}")

회귀 문제에서 특성 중요도

from pycaret.regression import *
from pycaret.datasets import get_data

# Same workflow for a regression task.

# Load the 'boston' sample dataset bundled with PyCaret.
data = get_data('boston')

# Initialise the regression experiment; 'medv' is the target column and
# session_id pins the random seed so repeated runs match.
reg = setup(data, target='medv', session_id=42, verbose=False)

# Train the model registered under the 'rf' id (random forest).
rf = create_model('rf')

# Draw the feature-importance plot for the trained regressor.
plot_model(rf, plot='feature')

특성 중요도 보고서

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

def feature_importance_report(model, X_train, X_test, y_train, y_test, feature_names,
                              *, top_n=10, consensus_n=5, n_repeats=30,
                              random_state=42, output_path='importance_report.png'):
    """Compare built-in and permutation importance for a fitted model.

    Builds two rankings, plots the leading features of each side by side,
    saves the figure, and prints the features that both rankings place in
    their top ``consensus_n``.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        X_train: Not used by the report; kept so existing calls keep working.
        X_test: Feature matrix on which permutation importance is measured.
        y_train: Not used by the report; kept so existing calls keep working.
        y_test: Labels matching ``X_test``.
        feature_names: One name per feature, in the model's column order.
        top_n: Number of features shown in each bar chart.
        consensus_n: Size of the top list compared between the two methods.
        n_repeats: Number of shuffles per feature for permutation importance.
        random_state: Seed passed to ``permutation_importance``.
        output_path: File the figure is written to.

    Returns:
        Tuple ``(builtin, permutation)`` of DataFrames with ``feature`` and
        ``importance`` columns, each sorted by importance in descending order.

    Note:
        The figure is saved but left open.
    """

    # 1. Built-in importance reported by the model itself
    builtin = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    # 2. Permutation importance measured on the hold-out data
    perm = permutation_importance(
        model, X_test, y_test, n_repeats=n_repeats, random_state=random_state
    )
    permutation = pd.DataFrame({
        'feature': feature_names,
        'importance': perm.importances_mean
    }).sort_values('importance', ascending=False)

    # Side-by-side charts; each y-axis is inverted so the top-ranked
    # feature appears at the top.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = (
        (axes[0], builtin, f'Built-in Importance (Top {top_n})'),
        (axes[1], permutation, f'Permutation Importance (Top {top_n})'),
    )
    for ax, ranking, title in panels:
        leading = ranking.head(top_n)
        ax.barh(leading['feature'], leading['importance'])
        ax.set_title(title)
        ax.invert_yaxis()

    fig.tight_layout()
    fig.savefig(output_path, dpi=150)

    # Features that both methods rank among their top consensus_n
    top_builtin = set(builtin['feature'].head(consensus_n))
    top_perm = set(permutation['feature'].head(consensus_n))
    consensus = top_builtin & top_perm

    print(f"\n두 방법 모두에서 상위 {consensus_n}개에 포함된 특성: {consensus}")

    return builtin, permutation

# Usage example
# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)
# report = feature_importance_report(rf, X_train, X_test, y_train, y_test, X.columns)

정리

특성 중요도: 각 특성의 예측 기여도
불순도 기반: 트리 모델 내장, 빠름
순열 중요도: 범용적, 신뢰성 높음
SHAP: 이론적 기반, 개별 해석 가능
상관된 특성 주의 필요
여러 방법 비교 권장

다음 글 예고

다음 글에서는 모델 해석 - Partial Dependence를 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #085

개요

실습 환경

특성 중요도의 종류

PyCaret에서 특성 중요도

트리 기반 모델 중요도

순열 중요도 (Permutation Importance)

선형 모델 계수

여러 방법 비교

상관된 특성 문제

특성 중요도 기반 선택

회귀 문제에서 특성 중요도

특성 중요도 보고서

정리

다음 글 예고

개요

실습 환경

특성 중요도의 종류

PyCaret에서 특성 중요도

트리 기반 모델 중요도

순열 중요도 (Permutation Importance)

선형 모델 계수

여러 방법 비교

상관된 특성 문제

특성 중요도 기반 선택

회귀 문제에서 특성 중요도

특성 중요도 보고서

정리

다음 글 예고