038 캘리포니아 주택 가격 예측 프로젝트

키워드: 주택 가격, 회귀 프로젝트, California Housing

개요

캘리포니아 주택 데이터셋은 회귀 문제의 대표적인 벤치마크입니다. 지역의 인구통계, 주택 정보를 바탕으로 중간 주택 가격을 예측합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, numpy, scikit-learn, matplotlib, seaborn

pip install flaml[automl] pandas scikit-learn seaborn matplotlib

프로젝트 개요

목표

캘리포니아 각 지역의 중간 주택 가격 예측

데이터세트

샘플 수: 20,640개 (지역 블록 그룹)
특성 수: 8개
타겟: 중간 주택 가격 (10만 달러 단위)

Step 1: 데이터 로드

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
import matplotlib.pyplot as plt
import seaborn as sns

# Load the California housing data and assemble it into a single DataFrame:
# the feature columns plus the target column 'MedHouseVal'.
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['MedHouseVal'] = housing.target

print("캘리포니아 주택 데이터:")
print(f"  크기: {df.shape}")
print("\n특성 설명:")
# Human-readable description for every column that is printed below.
# HouseAge is the *median* house age of a block group in this dataset, so it
# is described as a median rather than an average.
descriptions = {
    'MedInc': '중간 소득 (10,000달러)',
    'HouseAge': '주택 연령 중앙값',
    'AveRooms': '평균 방 개수',
    'AveBedrms': '평균 침실 개수',
    'Population': '블록 그룹 인구',
    'AveOccup': '평균 가구원 수',
    'Latitude': '위도',
    'Longitude': '경도',
    'MedHouseVal': '중간 주택 가격 (100,000달러)'
}
for col, desc in descriptions.items():
    print(f"  {col}: {desc}")

Step 2: 탐색적 데이터 분석

기본 통계

# Summary statistics for every column, rounded to two decimals.
print("\n기본 통계:")
summary_table = df.describe().round(2)
print(summary_table)

# Report the target's spread in dollars; each value is multiplied by 100000
# because the column is expressed in units of $100,000.
target = df['MedHouseVal']
target_stats = [
    ('최소', target.min()),
    ('최대', target.max()),
    ('평균', target.mean()),
    ('중앙값', target.median()),
]
print("\n타겟 변수 분포:")
for stat_label, stat_value in target_stats:
    print(f"  {stat_label}: ${stat_value * 100000:,.0f}")

타겟 분포 시각화

# Two side-by-side views of the target: a histogram and a box plot.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Histogram of the target values
axes[0].hist(df['MedHouseVal'], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Median House Value (100k$)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of House Prices')

# Box plot of the same column
axes[1].boxplot(df['MedHouseVal'])
axes[1].set_ylabel('Median House Value (100k$)')
axes[1].set_title('House Price Box Plot')

plt.tight_layout()
plt.show()

# Check how many rows sit at the upper cap. In the scikit-learn copy of this
# dataset the capped rows are stored as 5.00001 (i.e. $500,001), so an exact
# `== 5.0` comparison misses them; count everything at or above 5.0 instead.
print(f"\n최대값 클리핑 확인: {(df['MedHouseVal'] >= 5.0).sum()}개가 5.0 이상")

특성 간 상관관계

# Pairwise correlation matrix of all columns (features and target).
correlation = df.corr()

# Annotated heatmap of the matrix, with the colour scale centred on zero.
fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=ax_corr,
)
ax_corr.set_title('Feature Correlation Matrix')
fig_corr.tight_layout()
plt.show()

# List each feature's correlation with the target, strongest positive first.
# The target's own entry is dropped only for printing.
print("\n타겟(MedHouseVal)과의 상관관계:")
target_corr = correlation['MedHouseVal'].sort_values(ascending=False)
for feature, corr in target_corr.drop('MedHouseVal').items():
    print(f"  {feature}: {corr:.4f}")

지리적 분포

# Scatter every block group at its longitude/latitude, coloured by the target
# value, using tiny semi-transparent markers so dense areas stay readable.
fig_map, ax_map = plt.subplots(figsize=(12, 10))
scatter = ax_map.scatter(
    df['Longitude'],
    df['Latitude'],
    c=df['MedHouseVal'],
    cmap='viridis',
    s=1,
    alpha=0.5,
)
fig_map.colorbar(scatter, ax=ax_map, label='Median House Value (100k$)')
ax_map.set_xlabel('Longitude')
ax_map.set_ylabel('Latitude')
ax_map.set_title('California House Prices by Location')
fig_map.tight_layout()
plt.show()

Step 3: 데이터 전처리

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target column from the feature columns.
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for split_name, split_frame in (("학습 데이터", X_train), ("테스트 데이터", X_test)):
    print(f"{split_name}: {split_frame.shape}")

# Total number of missing cells in the training features.
missing_total = X_train.isna().sum().sum()
print(f"\n결측치: {missing_total}")

Step 4: FLAML AutoML 학습

from flaml import AutoML

# Create the AutoML search object.
automl = AutoML()

# Search settings, gathered in one place and passed to fit() as keywords.
fit_settings = {
    "task": "regression",
    "time_budget": 120,  # 2 minutes
    "metric": "r2",
    "seed": 42,
    "verbose": 1,
}
automl.fit(X_train, y_train, **fit_settings)

# This script reports the validation R² as 1 - best_loss.
validation_r2 = 1 - automl.best_loss
print(f"\n최적 모델: {automl.best_estimator}")
print(f"검증 R²: {validation_r2:.4f}")
print("\n최적 하이퍼파라미터:")
for param_name, param_value in automl.best_config.items():
    print(f"  {param_name}: {param_value}")

Step 5: 모델 평가

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out test features.
y_pred = automl.predict(X_test)

# Regression metrics on the test split; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# RMSE and MAE are additionally shown in dollars (target unit x 100000).
report_lines = [
    "테스트 성능:",
    f"  MSE: {mse:.4f}",
    f"  RMSE: {rmse:.4f} (${rmse * 100000:,.0f})",
    f"  MAE: {mae:.4f} (${mae * 100000:,.0f})",
    f"  R²: {r2:.4f}",
]
print("\n".join(report_lines))

예측 vs 실제 시각화

# Left panel: predicted vs. actual values; right panel: residual histogram.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax_fit, ax_resid = axes

# Scatter of actual vs. predicted, with the y = x reference line in red.
ax_fit.scatter(y_test, y_pred, alpha=0.3, s=10)
ax_fit.plot([0, 5], [0, 5], 'r--', linewidth=2)
ax_fit.set_xlabel('Actual Price (100k$)')
ax_fit.set_ylabel('Predicted Price (100k$)')
ax_fit.set_title(f'Actual vs Predicted (R² = {r2:.4f})')
ax_fit.set_xlim(0, 5.5)
ax_fit.set_ylim(0, 5.5)

# Residuals are defined as actual minus predicted; the dashed line marks zero.
residuals = y_test - y_pred
ax_resid.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
ax_resid.axvline(x=0, color='r', linestyle='--')
ax_resid.set_xlabel('Residual (Actual - Predicted)')
ax_resid.set_ylabel('Frequency')
ax_resid.set_title('Residual Distribution')

fig.tight_layout()
plt.show()

residual_mean = residuals.mean()
residual_std = residuals.std()
print("\n잔차 통계:")
print(f"  평균: {residual_mean:.4f}")
print(f"  표준편차: {residual_std:.4f}")

Step 6: 특성 중요도

# Plot and print feature importances when the best model exposes them.
# FLAML's AutoML has no `best_model` attribute; the best trained model is
# available as `automl.model`, and the underlying fitted estimator as
# `automl.model.estimator`. Fall back to the wrapper itself if it has no
# `estimator` attribute.
best_wrapper = automl.model
fitted_estimator = getattr(best_wrapper, 'estimator', best_wrapper)
importance = getattr(fitted_estimator, 'feature_importances_', None)

# Models without importances (attribute missing or None) are skipped silently.
if importance is not None:
    importance = np.asarray(importance)
    feature_names = X.columns

    # Indices of the features ordered from most to least important
    sorted_idx = np.argsort(importance)[::-1]

    plt.figure(figsize=(10, 6))
    plt.bar(range(len(importance)), importance[sorted_idx])
    plt.xticks(range(len(importance)), feature_names[sorted_idx], rotation=45)
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.title('Feature Importance for House Price Prediction')
    plt.tight_layout()
    plt.show()

    print("특성 중요도:")
    for i in sorted_idx:
        print(f"  {feature_names[i]}: {importance[i]:.4f}")

Step 7: 오차 분석

# Collect per-sample actual value, prediction, signed error and absolute
# error for the test split.
error_df = pd.DataFrame({
    'actual': y_test.values,
    'predicted': y_pred,
    'error': residuals.values,
    'abs_error': np.abs(residuals.values)
})

# The ten samples with the largest absolute error
print("오차가 큰 샘플 (Top 10):")
top_errors = error_df.nlargest(10, 'abs_error')
print(top_errors)

# MAE per price range. The last bin is open-ended: capped targets are stored
# slightly above 5.0 (5.00001), so a closed upper edge of 5 would turn them
# into NaN and silently drop them from the highest price range.
price_bins = [0, 1, 2, 3, 4, np.inf]
price_labels = ['0-1', '1-2', '2-3', '3-4', '4-5']
error_df['price_range'] = pd.cut(error_df['actual'], bins=price_bins, labels=price_labels)
print("\n가격대별 MAE:")
# observed=False keeps every price range in the output, even an empty one,
# and states the categorical-groupby behaviour explicitly.
print(error_df.groupby('price_range', observed=False)['abs_error'].mean().round(4))

Step 8: 새 데이터 예측

# Three hypothetical block groups, using the same feature columns as the
# training data.
new_areas = pd.DataFrame({
    'MedInc': [8.0, 3.5, 5.0],
    'HouseAge': [20, 40, 10],
    'AveRooms': [6.0, 4.5, 7.0],
    'AveBedrms': [1.1, 1.0, 1.2],
    'Population': [1000, 2000, 500],
    'AveOccup': [2.5, 3.0, 2.0],
    'Latitude': [37.5, 34.0, 38.5],
    'Longitude': [-122.0, -118.5, -121.5]
})

# One prediction per row of new_areas.
predictions = automl.predict(new_areas)

print("새 지역 예측:")
for row_pos, pred in enumerate(predictions):
    area = new_areas.iloc[row_pos]
    income = area['MedInc']
    house_age = area['HouseAge']
    print(f"\n지역 {row_pos + 1}:")
    # The prediction is converted to dollars (target unit x 100000).
    print(f"  예측 가격: ${pred * 100000:,.0f}")
    print(f"  입력 특성: 소득={income}만$, "
          f"주택연령={house_age}년")

Step 9: 모델 저장

import pickle

# Serialize the fitted AutoML object to disk with pickle.
model_path = 'california_housing_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(automl, model_file)

print(f"모델 저장 완료: {model_path}")

# Read the model back and run it on the first five test rows as a check.
# NOTE: pickle.load executes code from the file; only load files you trust.
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

test_pred = loaded_model.predict(X_test.head(5))
print(f"\n로드된 모델 테스트 예측: {test_pred.round(2)}")

정리

캘리포니아 주택 데이터로 회귀 프로젝트 전체 과정을 수행했습니다.
**MedInc(중간 소득)**이 가장 중요한 특성입니다.
R² = 0.85+ 수준의 좋은 예측 성능을 달성했습니다.
지리적 위치(위도, 경도)도 중요한 특성입니다.
고가 주택은 약 50만 달러(타겟 값 5.00001)에서 상한 클리핑되어 있어, 이 구간의 예측 정확도가 제한됩니다.

다음 글 예고

다음 글에서는 보스턴 주택 가격 vs 캘리포니아 비교에 대해 알아보겠습니다. 두 데이터세트의 특성과 모델링 차이를 비교합니다.

FLAML AutoML 마스터 시리즈 #038

개요​

실습 환경​

프로젝트 개요​

목표​

데이터세트​

Step 1: 데이터 로드​

Step 2: 탐색적 데이터 분석​

기본 통계​

타겟 분포 시각화​

특성 간 상관관계​

지리적 분포​

Step 3: 데이터 전처리​

Step 4: FLAML AutoML 학습​

Step 5: 모델 평가​

예측 vs 실제 시각화​

Step 6: 특성 중요도​

Step 7: 오차 분석​

Step 8: 새 데이터 예측​

Step 9: 모델 저장​

정리​

다음 글 예고​

개요

실습 환경

프로젝트 개요

목표

데이터세트

Step 1: 데이터 로드

Step 2: 탐색적 데이터 분석

기본 통계

타겟 분포 시각화

특성 간 상관관계

지리적 분포

Step 3: 데이터 전처리

Step 4: FLAML AutoML 학습

Step 5: 모델 평가

예측 vs 실제 시각화

Step 6: 특성 중요도

Step 7: 오차 분석

Step 8: 새 데이터 예측

Step 9: 모델 저장

정리

다음 글 예고