047 잔차 분석과 모델 진단

키워드: 잔차, residual, 모델 진단

개요

잔차(Residual)는 실제값과 예측값의 차이로, 회귀 모델의 성능을 진단하는 핵심 도구입니다. 잔차 분석을 통해 모델의 문제점을 파악하고 개선 방향을 찾을 수 있습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], scikit-learn, numpy, scipy, matplotlib, statsmodels

pip install "flaml[automl]" scikit-learn numpy matplotlib scipy statsmodels

잔차의 기본 개념

잔차 계산

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from flaml import AutoML

# Load the California housing dataset; `data` stays in scope because later
# sections read `data.feature_names`.
data = fetch_california_housing()
X = data.data
y = data.target

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a FLAML regression model with the settings below
automl_settings = dict(
    task="regression",
    time_budget=60,
    metric="mse",
    verbose=0,
)
automl = AutoML()
automl.fit(X_train, y_train, **automl_settings)

# Residual = actual value - predicted value on the held-out rows
y_pred = automl.predict(X_test)
residuals = np.subtract(y_test, y_pred)

# Summary statistics of the residuals
print("잔차 통계:")
print(f"  평균: {residuals.mean():.4f} (0에 가까워야 함)")
print(f"  표준편차: {residuals.std():.4f}")
print(f"  최소: {residuals.min():.4f}")
print(f"  최대: {residuals.max():.4f}")

잔차 유형

# Three common ways to express the same residuals
residuals_raw = y_test - y_pred  # raw residuals, in target units

# Standardized residuals: centre before scaling so the definition matches the
# z-scores used for outlier detection elsewhere in this file.
residuals_standardized = (residuals_raw - residuals_raw.mean()) / residuals_raw.std()

# Percentage residuals are undefined where the actual value is 0. Those
# entries are left as NaN instead of producing inf and a divide warning.
residuals_percent = np.full_like(residuals_raw, np.nan, dtype=float)
np.divide(residuals_raw, y_test, out=residuals_percent, where=(y_test != 0))
residuals_percent *= 100

print("\n잔차 유형별 통계:")
print(f"  원시 잔차 범위: [{residuals_raw.min():.2f}, {residuals_raw.max():.2f}]")
print(f"  표준화 잔차 범위: [{residuals_standardized.min():.2f}, {residuals_standardized.max():.2f}]")
# nanmean skips the undefined (zero-target) entries
print(f"  백분율 잔차 평균: {np.nanmean(np.abs(residuals_percent)):.2f}%")

잔차 시각화

기본 잔차 플롯

from scipy import stats

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
(ax_resid, ax_hist), (ax_qq, ax_fit) = axes

# 1. Residuals vs predicted values (the most important diagnostic)
ax_resid.scatter(y_pred, residuals, alpha=0.3, s=10)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set(
    xlabel='Predicted Values',
    ylabel='Residuals',
    title='Residuals vs Predicted',
)

# 2. Histogram of the residuals with a reference line at zero
ax_hist.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
ax_hist.axvline(x=0, color='r', linestyle='--')
ax_hist.set(
    xlabel='Residual',
    ylabel='Frequency',
    title='Residual Distribution',
)

# 3. Q-Q plot of the residuals against a normal distribution
stats.probplot(residuals, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')

# 4. Actual vs predicted values, with the identity line as reference
actual_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_pred, alpha=0.3, s=10)
ax_fit.plot(actual_range, actual_range, 'r--')
ax_fit.set(
    xlabel='Actual Values',
    ylabel='Predicted Values',
    title='Actual vs Predicted',
)

plt.tight_layout()
plt.show()

잔차 패턴 해석

def _durbin_watson_statistic(resid):
    """Return the Durbin-Watson statistic of a 1-D residual array."""
    try:
        from statsmodels.stats.stattools import durbin_watson
    except ImportError:
        # statsmodels is optional here: fall back to the defining formula,
        # sum of squared successive differences over the sum of squares.
        return np.sum(np.diff(resid) ** 2) / np.sum(resid ** 2)
    return durbin_watson(resid)


def _variance_ratio(high, low):
    """Return var(high) / var(low) without dividing by zero."""
    if high.size == 0 or low.size == 0:
        return float("nan")
    high_var, low_var = high.var(), low.var()
    if low_var == 0:
        # Both groups constant -> equal spread; only the low group constant
        # -> unbounded ratio.
        return 1.0 if high_var == 0 else float("inf")
    return high_var / low_var


def analyze_residual_pattern(y_true, y_pred):
    """Run three quick residual diagnostics and print the findings.

    The checks are a Shapiro-Wilk normality test, the Durbin-Watson
    autocorrelation statistic, and a crude heteroscedasticity check that
    compares residual variance above and below the median prediction.

    Args:
        y_true: Actual target values (array-like; flattened to 1-D).
        y_pred: Predicted values aligned with ``y_true`` (array-like;
            flattened to 1-D).

    Returns:
        dict with keys ``'normality_p'`` (Shapiro-Wilk p-value),
        ``'durbin_watson'`` and ``'variance_ratio'`` (variance of residuals
        for predictions >= median divided by the variance for predictions
        < median; NaN when one of the groups is empty).
    """
    # Local import keeps the function usable without relying on a
    # module-level `stats` name.
    from scipy import stats

    # Flatten both inputs: an (n, 1) prediction column would otherwise
    # broadcast against an (n,) target into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    residuals = y_true - y_pred

    # Normality: test at most the first 5000 residuals
    _, p_value = stats.shapiro(residuals[:5000])

    # Autocorrelation between successive residuals.
    # NOTE(review): only informative when the row order is meaningful.
    dw = _durbin_watson_statistic(residuals)

    # Simple heteroscedasticity check: split on the median prediction
    median_pred = np.median(y_pred)
    low_pred = y_pred < median_pred
    high_pred = y_pred >= median_pred
    var_ratio = _variance_ratio(residuals[high_pred], residuals[low_pred])

    print("잔차 패턴 분석:")
    print(f"\n1. 정규성 (Shapiro-Wilk):")
    print(f"   p-value: {p_value:.4f}")
    print(f"   → {'정규 분포로 볼 수 있음' if p_value > 0.05 else '비정규 분포'}")

    print(f"\n2. 자기상관 (Durbin-Watson):")
    print(f"   DW 통계량: {dw:.4f}")
    print(f"   → {'자기상관 없음' if 1.5 < dw < 2.5 else '자기상관 존재 가능'}")

    print(f"\n3. 이분산성:")
    print(f"   분산 비율 (고예측/저예측): {var_ratio:.4f}")
    print(f"   → {'등분산 가정 충족' if 0.5 < var_ratio < 2 else '이분산성 의심'}")

    return {
        'normality_p': p_value,
        'durbin_watson': dw,
        'variance_ratio': var_ratio
    }

# Run the pattern diagnostics on the test-set targets and predictions
results = analyze_residual_pattern(y_test, y_pred)

잔차 문제별 해결책

1. 비선형 패턴

from sklearn.preprocessing import PolynomialFeatures

# Remedies to consider when the residual plot shows a non-linear pattern
for remedy_line in (
    "비선형 패턴 해결책:",
    "  1. 다항 특성 추가",
    "  2. 트리 기반 모델 사용 (FLAML 기본)",
    "  3. 특성 변환 (로그, 제곱근)",
):
    print(remedy_line)

# Example: expand the training features with degree-2 polynomial terms
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train)
X_poly = poly.transform(X_train)
print(f"\n다항 특성 추가: {X_train.shape[1]} → {X_poly.shape[1]} 특성")

2. 이분산성

# Visualise heteroscedasticity
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Scale-location plot: sqrt(|residual|) against the predicted value
sqrt_abs_residuals = np.sqrt(np.abs(residuals))
axes[0].scatter(y_pred, sqrt_abs_residuals, alpha=0.3, s=10)
axes[0].set_xlabel('Predicted Values')
axes[0].set_ylabel('√|Residuals|')
axes[0].set_title('Scale-Location Plot')

# Remedies: weighted regression or a target transformation
print("\n이분산성 해결책:")
print("  1. 타겟 로그 변환")
print("  2. 가중 회귀 사용")
print("  3. 분산 안정화 변환")

# Log-transform example.
# y_train_log is the target a retrained model would be fitted on.
y_train_log = np.log1p(y_train)
# Put BOTH sides on the log scale before subtracting. Subtracting raw-scale
# predictions from log-scale targets would mix units. Predictions are
# clipped at 0 so log1p stays defined. This only approximates what a model
# retrained on y_train_log would produce.
y_pred_log_transformed = np.log1p(np.clip(y_pred, 0, None))
axes[1].hist(np.log1p(y_test) - y_pred_log_transformed, bins=50, alpha=0.7)
axes[1].set_title('Residuals after Log Transform (conceptual)')

plt.tight_layout()
plt.show()

3. 이상치 탐지

def detect_outliers_from_residuals(residuals, threshold=3):
    """Flag residuals whose absolute z-score exceeds ``threshold``.

    Prints the number and percentage of flagged residuals.

    Args:
        residuals: Residual values (array-like; converted to a float array).
        threshold: Cut-off applied to ``|z|``. Defaults to 3.

    Returns:
        Boolean NumPy array with the same shape as ``residuals``; ``True``
        marks an outlier. Empty or constant input yields all ``False``.
    """
    residuals = np.asarray(residuals, dtype=float)
    spread = residuals.std() if residuals.size else 0.0

    if spread > 0:
        standardized = (residuals - residuals.mean()) / spread
        outliers = np.abs(standardized) > threshold
    else:
        # Empty or constant input: nothing deviates from the mean, and
        # dividing by a zero spread would only produce NaN.
        outliers = np.zeros(residuals.shape, dtype=bool)

    outlier_percent = outliers.mean() * 100 if outliers.size else 0.0
    print(f"이상치 탐지 (|표준화 잔차| > {threshold}):")
    print(f"  이상치 수: {outliers.sum()} ({outlier_percent:.2f}%)")

    return outliers

# Flag outliers among the test-set residuals
outliers = detect_outliers_from_residuals(residuals)

# Scatter plot with flagged points in red
plt.figure(figsize=(10, 6))
colors = np.where(outliers, 'red', 'blue')
plt.scatter(y_pred, residuals, c=colors, alpha=0.5, s=10)
plt.axhline(y=0, color='black', linestyle='--')
# The detection rule is |residual - mean| > 3 * std, so the guide lines are
# centred on the residual mean rather than on zero.
residual_mean = residuals.mean()
residual_std = residuals.std()
plt.axhline(y=residual_mean + 3 * residual_std, color='red', linestyle=':', label='±3σ')
plt.axhline(y=residual_mean - 3 * residual_std, color='red', linestyle=':')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Outlier Detection from Residuals')
plt.legend()
plt.show()

특성별 잔차 분석

# Residuals plotted against each input feature
feature_names = data.feature_names

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()

for i, (name, ax) in enumerate(zip(feature_names, axes)):
    feature_column = X_test[:, i]
    ax.scatter(feature_column, residuals, alpha=0.3, s=5)
    ax.axhline(y=0, color='r', linestyle='--')
    ax.set(xlabel=name, ylabel='Residual', title=f'Residual vs {name}')

plt.tight_layout()
plt.show()

# Report features whose correlation with the residuals exceeds 0.1 in
# absolute value
print("\n특성별 잔차 상관관계:")
for i, name in enumerate(feature_names):
    corr_matrix = np.corrcoef(X_test[:, i], residuals)
    corr = corr_matrix[0, 1]
    if not abs(corr) > 0.1:
        continue
    print(f"  {name}: {corr:.4f} (패턴 의심)")

종합 진단 함수

def comprehensive_residual_analysis(y_true, y_pred, feature_names=None, X=None):
    """Print a multi-section diagnostic report for regression residuals.

    Sections: basic statistics, normality tests, z-score outliers, MAE per
    prediction quartile, and recommendations derived from those results.

    Args:
        y_true: Actual target values (array-like; flattened to 1-D).
        y_pred: Predicted values aligned with ``y_true`` (array-like;
            flattened to 1-D).
        feature_names: Accepted so existing calls keep working; not used.
        X: Accepted so existing calls keep working; not used.

    Returns:
        None. The report is written to stdout.
    """
    # Local import keeps the function usable without relying on a
    # module-level `stats` name.
    from scipy import stats

    # Flatten both inputs: an (n, 1) prediction column would otherwise
    # broadcast against an (n,) target into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    residuals = y_true - y_pred
    resid_mean = residuals.mean()
    resid_std = residuals.std()

    print("=" * 60)
    print("종합 잔차 분석 보고서")
    print("=" * 60)

    # 1. Basic statistics
    print("\n1. 기본 통계")
    print(f"   샘플 수: {len(residuals)}")
    print(f"   평균: {resid_mean:.4f}")
    print(f"   표준편차: {resid_std:.4f}")
    print(f"   중앙값: {np.median(residuals):.4f}")

    # 2. Normality. Shapiro-Wilk is run on at most the first 5000 residuals.
    # NOTE(review): the KS test uses a mean and std estimated from the same
    # sample, so its p-value tends to be optimistic.
    _, p_shapiro = stats.shapiro(residuals[:5000])
    _, p_ks = stats.kstest(residuals, 'norm', args=(resid_mean, resid_std))
    skewness = stats.skew(residuals)

    print("\n2. 정규성 검정")
    print(f"   Shapiro-Wilk p-value: {p_shapiro:.4f}")
    print(f"   Kolmogorov-Smirnov p-value: {p_ks:.4f}")
    print(f"   왜도: {skewness:.4f}")
    print(f"   첨도: {stats.kurtosis(residuals):.4f}")

    # 3. Outliers by z-score
    outlier_threshold = 3
    if resid_std > 0:
        z_scores = (residuals - resid_mean) / resid_std
        outliers = np.abs(z_scores) > outlier_threshold
    else:
        # Constant residuals: nothing deviates, and a zero spread would
        # only produce NaN z-scores.
        outliers = np.zeros(residuals.shape, dtype=bool)

    print(f"\n3. 이상치 (|z| > {outlier_threshold})")
    print(f"   이상치 수: {outliers.sum()} ({outliers.mean()*100:.2f}%)")

    # 4. MAE per prediction quartile. The outer edges are infinite so that
    # negative predictions fall into the first bin instead of being dropped.
    print("\n4. 예측 범위별 성능 (MAE)")
    q25, q50, q75 = np.percentile(y_pred, [25, 50, 75])
    bin_edges = [-np.inf, q25, q50, q75, np.inf]
    for i, (low, high) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        mask = (y_pred >= low) & (y_pred < high)
        if mask.any():
            mae = np.abs(residuals[mask]).mean()
            print(f"   Q{i+1}: MAE = {mae:.4f}")

    # 5. Recommendations derived from the checks above
    print("\n5. 권장사항")
    recommendations = []

    if p_shapiro < 0.05:
        recommendations.append("- 타겟 변환 고려 (로그, Box-Cox)")
    if abs(skewness) > 1:
        recommendations.append("- 비대칭 분포: 로버스트 손실 함수 사용")
    if outliers.mean() > 0.05:
        recommendations.append("- 이상치 처리 필요")

    if not recommendations:
        recommendations.append("- 잔차 분포 양호")

    for rec in recommendations:
        print(f"   {rec}")

    print("=" * 60)

# Print the full report for the test-set predictions
comprehensive_residual_analysis(y_test, y_pred, data.feature_names, X_test)

정리

잔차 = 실제값 - 예측값: 모델 진단의 핵심 도구
잔차 vs 예측값 플롯: 패턴이 없어야 좋은 모델
Q-Q Plot: 정규성 확인
이분산성: 예측값에 따라 잔차 분산이 달라지면 문제
이상치: 표준화 잔차 |z| > 3이면 이상치로 간주
잔차 패턴이 보이면 특성 추가/변환을 고려

다음 글 예고

다음 글에서는 다중 출력 회귀에 대해 알아보겠습니다. 여러 타겟을 동시에 예측하는 방법을 다룹니다.

FLAML AutoML 마스터 시리즈 #047

개요

실습 환경

잔차의 기본 개념

잔차 계산

잔차 유형

잔차 시각화

기본 잔차 플롯

잔차 패턴 해석

잔차 문제별 해결책

1. 비선형 패턴

2. 이분산성

3. 이상치 탐지

특성별 잔차 분석

종합 진단 함수

정리

다음 글 예고

개요

실습 환경

잔차의 기본 개념

잔차 계산

잔차 유형

잔차 시각화

기본 잔차 플롯

잔차 패턴 해석

잔차 문제별 해결책

1. 비선형 패턴

2. 이분산성

3. 이상치 탐지

특성별 잔차 분석

종합 진단 함수

정리

다음 글 예고