042 회귀에서의 이상치 처리

키워드: 이상치, outlier, 로버스트

개요

이상치(Outlier)는 다른 데이터와 크게 다른 값으로, 회귀 모델의 성능에 큰 영향을 미칩니다. 특히 선형 회귀에서 이상치는 전체 회귀선을 왜곡할 수 있습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], scikit-learn, numpy, scipy, pandas, matplotlib

pip install "flaml[automl]" scikit-learn numpy scipy pandas matplotlib

이상치의 영향

선형 회귀에서의 영향

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Baseline data: y = 2x + 5 plus Gaussian noise, seeded for reproducibility.
np.random.seed(42)
X_normal = np.random.rand(50, 1) * 10
y_normal = 2 * X_normal.flatten() + 5 + np.random.randn(50) * 2

# Same data with two extreme points appended at x=5 and x=6.
X_outlier = np.vstack([X_normal, [[5], [6]]])
y_outlier = np.append(y_normal, [50, 60])

# Fit one ordinary least-squares model per dataset.
model_normal = LinearRegression()
model_normal.fit(X_normal, y_normal)
model_outlier = LinearRegression()
model_outlier.fit(X_outlier, y_outlier)

slope_normal = model_normal.coef_[0]
slope_outlier = model_outlier.coef_[0]

# Side-by-side panels: clean fit on the left, contaminated fit on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_clean, ax_dirty = axes

# Left panel: data without outliers.
ax_clean.scatter(X_normal, y_normal, alpha=0.6)
ax_clean.plot(X_normal, model_normal.predict(X_normal), 'r-', linewidth=2)
ax_clean.set_title(f'Without Outliers (slope={slope_normal:.2f})')
ax_clean.set_xlabel('X')
ax_clean.set_ylabel('y')

# Right panel: data with outliers; the last two rows are the injected points.
ax_dirty.scatter(X_outlier, y_outlier, alpha=0.6)
ax_dirty.scatter(X_outlier[-2:], y_outlier[-2:], color='red', s=100, label='Outliers')
ax_dirty.plot(X_outlier, model_outlier.predict(X_outlier), 'r-', linewidth=2)
ax_dirty.set_title(f'With Outliers (slope={slope_outlier:.2f})')
ax_dirty.set_xlabel('X')
ax_dirty.set_ylabel('y')
ax_dirty.legend()

plt.tight_layout()
plt.show()

print(f"기울기 변화: {slope_normal:.2f} → {slope_outlier:.2f}")
print("→ 이상치 2개가 회귀선을 크게 왜곡!")

이상치 탐지 방법

1. IQR 방법

def detect_outliers_iqr(data, factor=1.5):
    """Flag values that fall outside the IQR fences.

    Args:
        data: Numeric values. Plain lists and tuples are converted to a NumPy
            array so the elementwise comparisons below work; any other input
            (e.g. an ndarray) is used as passed.
        factor: Multiplier applied to the IQR when placing the fences.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``. ``outliers`` is a
        boolean mask that is True where a value lies strictly below
        ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``.
    """
    # A Python list cannot be compared elementwise against a scalar, so
    # coerce the builtin sequence types up front.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)

    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    outliers = (data < lower_bound) | (data > upper_bound)
    return outliers, lower_bound, upper_bound

# Try the IQR detector on the noisy targets plus two injected extremes.
y_test_data = np.append(y_normal, [100, -20])
outliers, lower, upper = detect_outliers_iqr(y_test_data)
flagged_values = y_test_data[outliers]

print("IQR 방법:")
print(f"  정상 범위: {lower:.2f} ~ {upper:.2f}")
print(f"  이상치 수: {outliers.sum()}")
print(f"  이상치 값: {flagged_values}")

2. Z-Score 방법

from scipy import stats

def detect_outliers_zscore(data, threshold=3):
    """Flag values whose absolute z-score exceeds ``threshold``.

    Args:
        data: Numeric values passed to ``scipy.stats.zscore``.
        threshold: Cut-off on the absolute z-score.

    Returns:
        A tuple ``(outliers, z_scores)``: the boolean mask of flagged values
        and the absolute z-score of every value.
    """
    abs_z = np.abs(stats.zscore(data))
    return abs_z > threshold, abs_z

# Run the z-score detector on the same test targets used for the IQR demo.
outliers_z, z_scores = detect_outliers_zscore(y_test_data)
flagged_z = z_scores[outliers_z].round(2)

print("\nZ-Score 방법 (threshold=3):")
print(f"  이상치 수: {outliers_z.sum()}")
print(f"  이상치 Z-scores: {flagged_z}")

3. Isolation Forest

from sklearn.ensemble import IsolationForest

def detect_outliers_iforest(X, contamination=0.05):
    """Flag rows that an Isolation Forest labels as anomalies.

    Args:
        X: Feature matrix handed to ``IsolationForest.fit_predict``.
        contamination: Forwarded to ``IsolationForest`` unchanged.

    Returns:
        Boolean mask that is True where the forest predicted ``-1``.
    """
    forest = IsolationForest(contamination=contamination, random_state=42)
    labels = forest.fit_predict(X)
    return labels == -1

# Feature-space outlier detection. The data here has a single feature column;
# the two appended rows (15 and 20) lie outside the 0-10 range of X_normal.
X_with_outliers = np.vstack([X_normal, [[15], [20]]])
outliers_if = detect_outliers_iforest(X_with_outliers)
n_flagged_if = outliers_if.sum()

print("\nIsolation Forest:")
print(f"  이상치 수: {n_flagged_if}")

이상치 처리 방법

1. 제거

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from flaml import AutoML
from sklearn.metrics import r2_score

# Load the California housing features and target.
data = fetch_california_housing()
X, y = data.data, data.target

# Flag outliers in the target with the IQR rule.
outliers_mask, lower, upper = detect_outliers_iqr(y)
print(f"원본 데이터: {len(y)}개")
print(f"이상치: {outliers_mask.sum()}개")

# Keep only the rows whose target was not flagged.
keep_mask = ~outliers_mask
X_clean = X[keep_mask]
y_clean = y[keep_mask]
print(f"정제 후: {len(y_clean)}개")

# Split the cleaned data, then train and score on it.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

automl_clean = AutoML()
automl_clean.fit(X_train, y_train, task="regression", time_budget=30, verbose=0)
r2_clean = automl_clean.score(X_test, y_test)
print(f"이상치 제거 후 R²: {r2_clean:.4f}")

2. 클리핑 (Winsorizing)

def clip_outliers(data, lower_percentile=1, upper_percentile=99):
    """Clip values to the range between two percentiles of the data.

    Args:
        data: Numeric values to clip.
        lower_percentile: Percentile used as the lower clipping bound.
        upper_percentile: Percentile used as the upper clipping bound.

    Returns:
        The values with everything below the lower bound raised to it and
        everything above the upper bound lowered to it.
    """
    low_cut, high_cut = np.percentile(data, [lower_percentile, upper_percentile])
    return np.clip(data, low_cut, high_cut)

# Winsorize the target at the 1st and 99th percentiles.
y_clipped = clip_outliers(y, 1, 99)

print("\n클리핑:")
print(f"  원본 범위: {y.min():.2f} ~ {y.max():.2f}")
print(f"  클리핑 후: {y_clipped.min():.2f} ~ {y_clipped.max():.2f}")

# Train on the clipped target.
split_options = {"test_size": 0.2, "random_state": 42}
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_clipped, **split_options
)

automl_clipped = AutoML()
automl_clipped.fit(X_train_c, y_train_c, task="regression", time_budget=30, verbose=0)

# Split the unclipped target with the same options so the model is scored
# against the original test targets rather than the clipped ones.
_, X_test_orig, _, y_test_orig = train_test_split(X, y, **split_options)
y_pred_clipped = automl_clipped.predict(X_test_orig)
r2_on_original = r2_score(y_test_orig, y_pred_clipped)
print(f"클리핑 후 R² (원본 평가): {r2_on_original:.4f}")

3. 변환 (로그 변환)

# A log transform compresses large values, reducing the pull of extremes.
y_log = np.log(y)
orig_lo, orig_hi = y.min(), y.max()
log_lo, log_hi = y_log.min(), y_log.max()

print("\n로그 변환:")
print(f"  원본 범위: {orig_lo:.2f} ~ {orig_hi:.2f}")
print(f"  로그 후: {log_lo:.2f} ~ {log_hi:.2f}")

4. 로버스트 모델 사용

from sklearn.linear_model import HuberRegressor, RANSACRegressor

# Huber regression: a linear model intended to be robust to outliers.
huber = HuberRegressor()
huber.fit(X_outlier, y_outlier)

# RANSAC: fits on a subset of the data it treats as inliers.
ransac = RANSACRegressor(random_state=42)
ransac.fit(X_outlier, y_outlier)

# Slopes of the plain and robust fits on the contaminated data.
slope_report = [
    ("Linear Regression", model_outlier.coef_[0]),
    ("Huber Regression", huber.coef_[0]),
    ("RANSAC", ransac.estimator_.coef_[0]),
]

print("\n로버스트 모델 비교:")
for model_label, fitted_slope in slope_report:
    print(f"  {model_label} 기울기: {fitted_slope:.2f}")
print("  실제 기울기: 2.00")

FLAML에서 이상치 처리

종합 파이프라인

def handle_outliers_pipeline(X, y, method='clip', **kwargs):
    """Apply one outlier-handling strategy to the target ``y``.

    Args:
        X: Feature rows aligned with ``y``.
        y: Target values.
        method: ``'remove'`` drops rows whose target lies outside the IQR
            fences, ``'clip'`` clips the target to a percentile range, and
            ``'log'`` applies ``np.log1p`` to the target. Any other value
            returns the inputs unchanged.
        **kwargs: ``factor`` (default 1.5) for ``'remove'``;
            ``lower_percentile`` (default 1) and ``upper_percentile``
            (default 99) for ``'clip'``.

    Returns:
        A tuple ``(X, y)`` after processing. Only ``'remove'`` changes ``X``.
    """
    if method == 'remove':
        # Plain lists support neither elementwise comparison nor boolean-mask
        # indexing, so coerce the builtin sequence types before filtering.
        if isinstance(X, (list, tuple)):
            X = np.asarray(X)
        if isinstance(y, (list, tuple)):
            y = np.asarray(y)

        factor = kwargs.get('factor', 1.5)
        q1, q3 = np.percentile(y, [25, 75])
        iqr = q3 - q1
        # Values exactly on a fence are kept (inclusive bounds).
        mask = (y >= q1 - factor * iqr) & (y <= q3 + factor * iqr)
        return X[mask], y[mask]

    elif method == 'clip':
        lower = kwargs.get('lower_percentile', 1)
        upper = kwargs.get('upper_percentile', 99)
        y_clipped = np.clip(y, np.percentile(y, lower), np.percentile(y, upper))
        return X, y_clipped

    elif method == 'log':
        # log1p pairs with np.expm1 when predictions are mapped back.
        return X, np.log1p(y)

    else:
        return X, y

# Compare the outlier-handling strategies on one shared hold-out set.
methods = ['none', 'remove', 'clip', 'log']
results = {}

# Split once, before any outlier handling. Every method is then scored on the
# same untouched test targets, and the IQR fences / clipping percentiles are
# computed from training rows only, so the R² values are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for method in methods:
    if method == 'none':
        X_proc, y_proc = X_train, y_train
    else:
        X_proc, y_proc = handle_outliers_pipeline(X_train, y_train, method=method)

    automl = AutoML()
    automl.fit(X_proc, y_proc, task="regression", time_budget=30, verbose=0)

    y_pred = automl.predict(X_test)
    if method == 'log':
        # The model was trained on log1p(y); map predictions back to the
        # original scale before scoring.
        y_pred = np.expm1(y_pred)

    results[method] = r2_score(y_test, y_pred)

print("\n이상치 처리 방법별 R²:")
for method, r2 in results.items():
    print(f"  {method}: {r2:.4f}")

처리 방법 선택 가이드

import pandas as pd

# Each row is (situation, recommended method, reason).
guide_rows = [
    ('이상치가 오류', '제거', '잘못된 데이터 제거'),
    ('이상치가 실제 값', '클리핑 또는 변환', '정보 손실 최소화'),
    ('이상치가 적음', '제거', '영향 제한적'),
    ('이상치가 많음', 'Huber/로그 변환', '제거 시 데이터 손실 큼'),
]

# Transpose the rows into the column-oriented dict that DataFrame expects.
situations, recommendations, reasons = (list(column) for column in zip(*guide_rows))
guide = {
    '상황': situations,
    '권장 방법': recommendations,
    '이유': reasons,
}

print("\n이상치 처리 방법 선택 가이드:")
guide_table = pd.DataFrame(guide)
print(guide_table.to_string(index=False))

정리

이상치는 회귀 모델, 특히 선형 모델에 큰 영향을 미칩니다.
탐지 방법: IQR, Z-Score, Isolation Forest
처리 방법: 제거, 클리핑, 변환, 로버스트 모델
트리 기반 모델(LightGBM, XGBoost)은 이상치에 상대적으로 강건합니다.
이상치가 오류인지 실제 값인지 판단하는 것이 중요합니다.
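FLAML의 기본 탐색 대상은 LightGBM, XGBoost 등 트리 기반 모델 위주이므로 입력 특성(X)의 이상치 영향은 상대적으로 제한적입니다. 다만 타깃(y)의 이상치는 손실 함수를 통해 여전히 학습에 영향을 줄 수 있으므로 별도로 확인해야 합니다.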

다음 글 예고

다음 글에서는 회귀 프로젝트 - 자동차 가격 예측에 대해 알아보겠습니다. 실제 자동차 데이터를 활용한 회귀 프로젝트를 진행합니다.

FLAML AutoML 마스터 시리즈 #042

개요

실습 환경

이상치의 영향

선형 회귀에서의 영향

이상치 탐지 방법

1. IQR 방법

2. Z-Score 방법

3. Isolation Forest

이상치 처리 방법

1. 제거

2. 클리핑 (Winsorizing)

3. 변환 (로그 변환)

4. 로버스트 모델 사용

FLAML에서 이상치 처리

종합 파이프라인

처리 방법 선택 가이드

정리

다음 글 예고

개요

실습 환경

이상치의 영향

선형 회귀에서의 영향

이상치 탐지 방법

1. IQR 방법

2. Z-Score 방법

3. Isolation Forest

이상치 처리 방법

1. 제거

2. 클리핑 (Winsorizing)

3. 변환 (로그 변환)

4. 로버스트 모델 사용

FLAML에서 이상치 처리

종합 파이프라인

처리 방법 선택 가이드

정리

다음 글 예고