042 회귀 첫 번째 예제 - 주택 가격 예측

키워드: 회귀, 주택 가격

개요

주택 가격 예측은 회귀의 대표적인 예제입니다. 이 글에서는 보스턴 주택 데이터로 처음부터 끝까지 회귀 모델을 구축합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

데이터 로드 및 탐색

from pycaret.regression import *
from pycaret.datasets import get_data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the Boston housing dataset that ships with PyCaret
data = get_data('boston')

# Report the frame's shape and the names of its columns
print(f"데이터 크기: {data.shape}")
print("\n컬럼 정보:")
print(list(data.columns))

데이터 설명

컬럼	설명
crim	범죄율
zn	주거 지역 비율
indus	비소매 상업 지역 비율
chas	찰스 강 인접 여부
nox	질소산화물 농도
rm	평균 방 개수
age	1940년 이전 건물 비율
dis	직장까지 거리
rad	고속도로 접근성
tax	재산세율
ptratio	학생-교사 비율
black	흑인 거주 비율 기반 지표
lstat	저소득층 비율
medv	중간 주택 가격 (타겟)

탐색적 데이터 분석

import scipy.stats as stats

# Summary statistics for every numeric column
print(data.describe())

# Three side-by-side views of the target (medv): histogram, boxplot and a
# normal Q-Q plot, each drawn on its own explicitly named Axes.
target_fig, (hist_ax, box_ax, qq_ax) = plt.subplots(1, 3, figsize=(12, 4))

data['medv'].hist(bins=30, ax=hist_ax)
hist_ax.set_title('Price Distribution')
hist_ax.set_xlabel('Price ($1000)')

data.boxplot(column='medv', ax=box_ax)
box_ax.set_title('Price Boxplot')

# probplot sets its own default title, so the title is assigned afterwards
stats.probplot(data['medv'], plot=qq_ax)
qq_ax.set_title('Q-Q Plot')

target_fig.tight_layout()
target_fig.savefig('boston_target_analysis.png', dpi=150)

특성과 타겟 관계

# Scatter each selected feature against the price on a 2x2 grid
important_features = ['rm', 'lstat', 'ptratio', 'dis']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

# Pair every panel with the feature it displays
for panel, feature in zip(axes, important_features):
    panel.scatter(data[feature], data['medv'], alpha=0.5)
    panel.set(xlabel=feature, ylabel='Price', title=f'{feature} vs Price')

fig.tight_layout()
fig.savefig('boston_feature_relations.png', dpi=150)

PyCaret 환경 설정

# Initialise the PyCaret regression experiment with 'medv' as the target.
# session_id fixes the random seed; verbose=False silences the setup output.
reg = setup(data, target='medv', session_id=42, verbose=False)

print("Setup 완료!")

모델 비교

# Train and rank the candidate regressors, keeping the five best
print("=== 모델 비교 ===")
best_models = compare_models(n_select=5)

# List the class name of each selected model, in rank order
print("\n상위 5개 모델:")
for rank, candidate in enumerate(best_models, start=1):
    print(f"{rank}. {type(candidate).__name__}")

최고 모델 튜닝

# Hyperparameter-tune the top-ranked model, optimising for RMSE
print("\n=== 모델 튜닝 ===")
tuned_model = tune_model(
    best_models[0],
    optimize='RMSE',
)

앙상블

# Blend the three top-ranked models into a single ensemble
print("\n=== 앙상블 (Blending) ===")
blended = blend_models(
    best_models[:3],
)

모델 평가

# Diagnostic plots, saved to disk in this order:
#   residuals and prediction error for the blended ensemble,
#   feature importance and learning curve for the top-ranked single model.
evaluation_plots = [
    (blended, 'residuals'),
    (blended, 'error'),
    (best_models[0], 'feature'),
    (best_models[0], 'learning'),
]

for estimator, plot_kind in evaluation_plots:
    plot_model(estimator, plot=plot_kind, save=True)

예측 결과 분석

# Score the hold-out set with the blended ensemble
predictions = predict_model(blended)

# Show actual vs. predicted price for the first ten rows
print("\n예측 결과 샘플:")
print(predictions[['medv', 'prediction_label']].head(10))

# Per-row error columns: signed, absolute, and absolute percentage
predictions['error'] = predictions['medv'].sub(predictions['prediction_label'])
predictions['abs_error'] = predictions['error'].abs()
predictions['pct_error'] = (
    predictions['error'].div(predictions['medv']).mul(100).abs()
)

# medv is expressed in $1000s, hence the scaling for display
print(f"\n평균 절대 오차: ${predictions['abs_error'].mean()*1000:.0f}")
print(f"평균 오차율: {predictions['pct_error'].mean():.1f}%")

최종 모델 저장

# Refit the blended ensemble via finalize_model, then persist it under the
# given base name; the message below reports the resulting .pkl file.
final_model = finalize_model(blended)
save_model(final_model, model_name='boston_house_price_model')

print("모델 저장 완료: boston_house_price_model.pkl")

새 데이터 예측

# Reload the pipeline that was saved under this base name
loaded_model = load_model('boston_house_price_model')

# Feature values for a single unseen house.
# The frame must carry every feature column the pipeline was trained on.
# PyCaret's 'boston' dataset has 13 feature columns, including 'black';
# leaving that column out makes the inference frame inconsistent with the
# training data, so it is supplied here alongside the other twelve.
new_house = pd.DataFrame({
    'crim': [0.05],
    'zn': [18.0],
    'indus': [5.0],
    'chas': [0],
    'nox': [0.45],
    'rm': [6.5],
    'age': [40.0],
    'dis': [5.0],
    'rad': [4],
    'tax': [300],
    'ptratio': [18.0],
    'black': [390.0],
    'lstat': [8.0],
})

# Score the new row; the estimate is read from the 'prediction_label' column
prediction = predict_model(loaded_model, data=new_house)
predicted_price = prediction['prediction_label'].values[0]

# medv is expressed in $1000s, so scale up for display
print(f"\n=== 새 주택 가격 예측 ===")
print(f"예측 가격: ${predicted_price * 1000:,.0f}")

전체 워크플로우 요약

from pycaret.regression import *
from pycaret.datasets import get_data

# 1. Load the data
data = get_data('boston')

# 2. Initialise the experiment
reg = setup(data=data, target='medv', session_id=42, verbose=False)

# 3. Compare models and keep the top three
best = compare_models(n_select=3)

# 4. Tune the top-ranked model
top_ranked = best[0]
tuned = tune_model(top_ranked)

# 5. Blend the selected models
blended = blend_models(best)

# 6. Finalize the ensemble and save it
final = finalize_model(blended)
save_model(final, model_name='boston_model')

print("회귀 워크플로우 완료!")

정리

회귀 워크플로우: setup → compare → tune → blend → finalize
분류와 동일한 함수, 다른 평가 지표
잔차 분석으로 모델 품질 진단
특성 중요도로 영향력 있는 변수 파악
저장된 모델로 새 데이터 예측

다음 글 예고

다음 글에서는 회귀 평가 지표 - MAE, MSE, RMSE를 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #042

개요​

실습 환경​

데이터 로드 및 탐색​

데이터 설명​

탐색적 데이터 분석​

특성과 타겟 관계​

PyCaret 환경 설정​

모델 비교​

최고 모델 튜닝​

앙상블​

모델 평가​

예측 결과 분석​

최종 모델 저장​

새 데이터 예측​

전체 워크플로우 요약​

정리​

다음 글 예고​

개요

실습 환경

데이터 로드 및 탐색

데이터 설명

탐색적 데이터 분석

특성과 타겟 관계

PyCaret 환경 설정

모델 비교

최고 모델 튜닝

앙상블

모델 평가

예측 결과 분석

최종 모델 저장

새 데이터 예측

전체 워크플로우 요약

정리

다음 글 예고