039 보스턴 주택 vs 캘리포니아 주택 비교

키워드: 데이터세트 비교, 회귀, 특성 분석

개요

보스턴 주택과 캘리포니아 주택 데이터세트는 회귀 문제의 대표적인 벤치마크입니다. 이 글에서는 두 데이터세트의 특성과 모델링 결과를 비교하여 회귀 문제에 대한 이해를 높입니다. 단, 보스턴 주택 데이터는 scikit-learn에서 제거되었으므로 이 글에서는 동일한 특성 이름을 가진 합성(시뮬레이션) 데이터로 대체합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, scikit-learn, matplotlib

pip install flaml[automl] pandas scikit-learn matplotlib

데이터세트 개요

기본 정보 비교

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
import warnings
warnings.filterwarnings('ignore')

# Load the California housing data: the target is kept exactly as returned
# by scikit-learn, the feature matrix is wrapped in a labelled DataFrame.
california = fetch_california_housing()
y_cal = california.target
X_cal = pd.DataFrame(data=california.data, columns=california.feature_names)

# Boston housing stand-in: the original dataset was removed from scikit-learn
# for ethical reasons, so synthetic columns with the same names are generated.
n_boston = 506
np.random.seed(42)

# The columns are drawn in this exact order from the seeded global RNG;
# reordering them would change every generated value.
boston_features = dict(
    CRIM=np.random.exponential(3, n_boston),       # crime rate
    ZN=np.random.uniform(0, 100, n_boston),        # residential zoning share
    INDUS=np.random.uniform(0, 30, n_boston),      # industrial share
    CHAS=np.random.choice([0, 1], n_boston),       # riverside flag
    NOX=np.random.uniform(0.3, 0.9, n_boston),     # nitric oxide concentration
    RM=np.random.normal(6.3, 0.7, n_boston),       # average number of rooms
    AGE=np.random.uniform(0, 100, n_boston),       # housing age
    DIS=np.random.exponential(4, n_boston),        # distance to employment centres
    RAD=np.random.choice(range(1, 25), n_boston),  # highway accessibility
    TAX=np.random.uniform(150, 700, n_boston),     # property tax rate
    PTRATIO=np.random.uniform(12, 22, n_boston),   # pupil/teacher ratio
    B=np.random.uniform(0, 400, n_boston),         # population ratio
    LSTAT=np.random.uniform(2, 38, n_boston),      # lower-income share
)
X_boston = pd.DataFrame(boston_features)

# Target: a linear function of RM and LSTAT plus Gaussian noise,
# limited to the interval [5, 50].
price_noise = np.random.normal(0, 3, n_boston)
y_boston = 10 + 5 * X_boston['RM'] - 0.5 * X_boston['LSTAT'] + price_noise
y_boston = y_boston.clip(lower=5, upper=50)

# Side-by-side summary table of the two datasets.
cal_rows, cal_cols = X_cal.shape
boston_rows, boston_cols = X_boston.shape
cal_target_range = f'{y_cal.min():.1f} - {y_cal.max():.1f}'
boston_target_range = f'{y_boston.min():.1f} - {y_boston.max():.1f}'

comparison = pd.DataFrame({
    '항목': ['샘플 수', '특성 수', '타겟 범위', '지역', '단위'],
    'California': [cal_rows, cal_cols, cal_target_range, '캘리포니아', '100,000$'],
    'Boston (시뮬)': [boston_rows, boston_cols, boston_target_range, '보스턴', '1,000$'],
})

# Heading and table go out as two lines, without the DataFrame index.
print("데이터세트 기본 비교:", comparison.to_string(index=False), sep="\n")

특성 비교

# Human-readable description for each California feature.
cal_features = {
    'MedInc': '중간 소득',
    'HouseAge': '주택 연령',
    'AveRooms': '평균 방 수',
    'AveBedrms': '평균 침실 수',
    'Population': '인구',
    'AveOccup': '평균 가구원',
    'Latitude': '위도',
    'Longitude': '경도',
}

# Human-readable description for each Boston feature.
boston_features_desc = {
    'CRIM': '범죄율',
    'ZN': '주거지역 비율',
    'INDUS': '산업지역 비율',
    'CHAS': '강변 여부',
    'NOX': '질소산화물 농도',
    'RM': '평균 방 수',
    'AGE': '오래된 주택 비율',
    'DIS': '고용 중심지 거리',
    'RAD': '고속도로 접근성',
    'TAX': '재산세율',
    'PTRATIO': '학생/교사 비율',
    'B': '인구 비율',
    'LSTAT': '저소득층 비율',
}

# Print both catalogues with the same layout: a heading, then one
# "name: description" line per feature.
print("\n특성 비교:")
for heading, catalogue in (
    ("\n캘리포니아 (8개):", cal_features),
    ("\n보스턴 (13개):", boston_features_desc),
):
    print(heading)
    for feat, desc in catalogue.items():
        print(f"  {feat}: {desc}")

통계 비교

import matplotlib.pyplot as plt

# Two histograms side by side: California target on the left,
# simulated Boston target on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel: (values, bar colour, x-axis label, title).
histogram_panels = [
    (
        y_cal,
        'blue',
        'Price (100k$)',
        f'California House Prices\n(Mean: ${y_cal.mean()*100:.0f}k)',
    ),
    (
        y_boston,
        'green',
        'Price (1k$)',
        f'Boston House Prices (Simulated)\n(Mean: ${y_boston.mean():.0f}k)',
    ),
]

for panel, (prices, bar_color, x_label, panel_title) in zip(axes, histogram_panels):
    panel.hist(prices, bins=50, edgecolor='black', alpha=0.7, color=bar_color)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

FLAML 모델링 비교

from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import time

def train_and_evaluate(X, y, name, time_budget=60):
    """Fit FLAML AutoML on one dataset and score it on a held-out split.

    The data is split 80/20 with a fixed ``random_state`` of 42, an AutoML
    regression search optimising R² runs on the training part, and the
    resulting model is scored on the held-out part.

    Args:
        X: Feature table passed to ``train_test_split`` and ``AutoML.fit``.
        y: Regression target aligned with ``X``.
        name: Label stored unchanged in the result under ``'name'``.
        time_budget: Value forwarded to ``AutoML.fit(time_budget=...)``.

    Returns:
        dict with the keys ``'name'``, ``'model'`` (``automl.best_estimator``),
        ``'r2'`` and ``'mae'`` (both computed on the held-out split) and
        ``'time'`` (elapsed seconds of the ``fit`` call).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    automl = AutoML()

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()

    automl.fit(
        X_train, y_train,
        task="regression",
        time_budget=time_budget,
        metric="r2",
        verbose=0
    )

    training_time = time.perf_counter() - start_time

    y_pred = automl.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    return {
        'name': name,
        'model': automl.best_estimator,
        'r2': r2,
        'mae': mae,
        'time': training_time
    }

# Run the same 60-second AutoML search on both datasets.
cal_result = train_and_evaluate(X_cal, y_cal, 'California', time_budget=60)
boston_result = train_and_evaluate(X_boston, y_boston, 'Boston', time_budget=60)

# Fixed-width results table: a header between two rules, then one row per dataset.
table_rule = "-" * 60
print("\n모델링 결과 비교:")
print(table_rule)
print(f"{'데이터세트':<15} {'최적 모델':<15} {'R²':<10} {'MAE':<10}")
print(table_rule)
for outcome in (cal_result, boston_result):
    print(f"{outcome['name']:<15} {outcome['model']:<15} {outcome['r2']:<10.4f} {outcome['mae']:<10.4f}")

특성 중요도 비교

from flaml import AutoML

def _plot_feature_importance(ax, automl, feature_names, title):
    """Draw the fitted model's feature importances as horizontal bars on ``ax``.

    FLAML's AutoML exposes the best trained learner as ``automl.model`` and
    the underlying library model as ``automl.model.estimator``; the documented
    API has no ``best_model`` attribute, so reading it raises AttributeError.
    If no importances are available the axes are left untouched.
    """
    fitted = getattr(automl.model, 'estimator', None)
    importances = getattr(fitted, 'feature_importances_', None)
    if importances is None:
        return

    importances = np.asarray(importances)
    order = np.argsort(importances)  # ascending, so the largest bar ends up on top
    positions = range(len(importances))
    ax.barh(positions, importances[order])
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names[order])
    ax.set_title(title)
    ax.set_xlabel('Importance')


# Retrain on California with the search restricted to "lgbm" so the
# resulting model provides feature importances.
X_train_cal, X_test_cal, y_train_cal, y_test_cal = train_test_split(
    X_cal, y_cal, test_size=0.2, random_state=42
)

automl_cal = AutoML()
automl_cal.fit(X_train_cal, y_train_cal, task="regression", time_budget=60,
               estimator_list=["lgbm"], verbose=0)

# Retrain on the simulated Boston data with the same settings.
X_train_bos, X_test_bos, y_train_bos, y_test_bos = train_test_split(
    X_boston, y_boston, test_size=0.2, random_state=42
)

automl_bos = AutoML()
automl_bos.fit(X_train_bos, y_train_bos, task="regression", time_budget=60,
               estimator_list=["lgbm"], verbose=0)

# Feature importance charts, one panel per dataset.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

_plot_feature_importance(axes[0], automl_cal, X_cal.columns,
                         'California - Feature Importance')
_plot_feature_importance(axes[1], automl_bos, X_boston.columns,
                         'Boston - Feature Importance')

plt.tight_layout()
plt.show()

주요 차이점 분석

# One tuple per comparison row: (item, California, Boston).
_difference_rows = [
    ('데이터 크기', '큼 (20,640)', '작음 (506)'),
    ('지리 정보', '위도/경도 포함', '없음'),
    ('환경 요인', '없음', '질소산화물, 범죄율'),
    ('사회 요인', '소득 중심', '다양 (교육, 소득)'),
    ('난이도', '중간 (R² ~0.85)', '쉬움 (R² ~0.90+)'),
]

# Transpose the rows into the column-oriented mapping that DataFrame expects.
_items, _california_side, _boston_side = (
    list(column) for column in zip(*_difference_rows)
)
differences = {
    '항목': _items,
    'California': _california_side,
    'Boston': _boston_side,
}

differences_table = pd.DataFrame(differences)
print("\n주요 차이점:")
print(differences_table.to_string(index=False))

데이터세트 선택 가이드

# One tuple per recommendation: (purpose, recommended dataset, reason).
_guide_rows = [
    ('회귀 입문', 'Boston', '작은 크기, 높은 성능'),
    ('특성 엔지니어링 연습', 'Boston', '다양한 특성 유형'),
    ('대용량 처리 연습', 'California', '2만+ 샘플'),
    ('지리 데이터 활용', 'California', '위도/경도 포함'),
]

# Column-oriented view of the rows, keyed by the table headers.
selection_guide = {
    header: [row[position] for row in _guide_rows]
    for position, header in enumerate(('목적', '권장', '이유'))
}

guide_table = pd.DataFrame(selection_guide)
print("\n데이터세트 선택 가이드:")
print(guide_table.to_string(index=False))

교훈

# Takeaways from the comparison, printed as a numbered list starting at 1.
lessons = [
    "데이터 크기가 항상 성능을 보장하지 않음",
    "특성의 질이 양보다 중요할 수 있음",
    "지리 정보는 부동산 예측에 매우 중요",
    "도메인 지식이 특성 선택에 핵심",
    "작은 데이터도 좋은 모델을 만들 수 있음",
]

print("\n교훈:")
for number, takeaway in enumerate(lessons, start=1):
    print(f"  {number}. {takeaway}")

정리

항목	California	Boston
샘플 수	20,640	506
특성 수	8	13
지리 정보	있음	없음
환경 요인	없음	있음
핵심 특성	소득, 위치	방 수, 저소득층
예측 난이도	중간	쉬움

다음 글 예고

다음 글에서는 회귀에서의 특성 스케일링에 대해 알아보겠습니다. 스케일링이 회귀 모델에 미치는 영향과 적용 방법을 다룹니다.

FLAML AutoML 마스터 시리즈 #039

개요​

실습 환경​

데이터세트 개요​

기본 정보 비교​

특성 비교​

통계 비교​

FLAML 모델링 비교​

특성 중요도 비교​

주요 차이점 분석​

데이터세트 선택 가이드​

교훈​

정리​

다음 글 예고​

개요

실습 환경

데이터세트 개요

기본 정보 비교

특성 비교

통계 비교

FLAML 모델링 비교

특성 중요도 비교

주요 차이점 분석

데이터세트 선택 가이드

교훈

정리

다음 글 예고