043 회귀 프로젝트 - 자동차 가격 예측

키워드: 자동차 가격, 회귀, 실전 프로젝트

개요

자동차 가격 예측은 다양한 특성(브랜드, 연식, 주행거리 등)을 활용한 대표적인 회귀 문제입니다. 이 글에서는 중고차 가격 예측 모델을 개발합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, scikit-learn, matplotlib, seaborn

pip install "flaml[automl]" pandas scikit-learn matplotlib seaborn

프로젝트 개요

목표

중고차의 특성을 기반으로 판매 가격 예측

활용 분야

중고차 딜러 가격 책정
소비자 적정 가격 확인
자동차 감가상각 분석

Step 1: 데이터 준비

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Synthetic used-car dataset; the fixed seed makes every run produce the same rows.
np.random.seed(42)
n_samples = 5000

# Vocabularies for the categorical columns.
brands = ['Toyota', 'Honda', 'BMW', 'Mercedes', 'Ford', 'Hyundai', 'Kia']
fuel_types = ['Gasoline', 'Diesel', 'Hybrid', 'Electric']
transmissions = ['Automatic', 'Manual']

# Draw the columns one at a time, in this order: the global NumPy RNG is
# stateful, so the order of the draws determines the generated values.
data = {}
data['brand'] = np.random.choice(brands, n_samples, p=[0.2, 0.15, 0.1, 0.1, 0.15, 0.15, 0.15])
data['year'] = np.random.randint(2010, 2024, n_samples)
data['mileage'] = np.random.exponential(50000, n_samples).astype(int)
data['fuel_type'] = np.random.choice(fuel_types, n_samples, p=[0.5, 0.25, 0.15, 0.1])
data['transmission'] = np.random.choice(transmissions, n_samples, p=[0.7, 0.3])
data['engine_size'] = np.random.uniform(1.0, 4.0, n_samples).round(1)
data['horsepower'] = np.random.randint(100, 400, n_samples)
data['seats'] = np.random.choice([4, 5, 7], n_samples, p=[0.2, 0.6, 0.2])
data['previous_owners'] = np.random.choice([1, 2, 3, 4], n_samples, p=[0.4, 0.35, 0.2, 0.05])

df = pd.DataFrame(data)

# Price is derived from the features: a brand-scaled base price multiplied by
# a chain of adjustment factors, plus Gaussian noise.
base_price = 30000
brand_premium = {'Toyota': 1.0, 'Honda': 0.95, 'BMW': 1.5, 'Mercedes': 1.6, 'Ford': 0.9, 'Hyundai': 0.85, 'Kia': 0.8}

price_factors = [
    # Age depreciation: 8% per year before 2024. The factor is negative for
    # 2010 and 2011 (14 and 13 years x 0.08 > 1); the final clip below keeps
    # the resulting price inside the allowed range.
    1 - (2024 - df['year']) * 0.08,
    # Mileage depreciation.
    1 - df['mileage'] / 500000,
    # Horsepower premium.
    1 + df['horsepower'] / 1000,
    # Electric-vehicle premium.
    np.where(df['fuel_type'] == 'Electric', 1.3, 1.0),
    # Depreciation per previous owner.
    1 - df['previous_owners'] * 0.05,
]

price = base_price * df['brand'].map(brand_premium)
for factor in price_factors:
    price = price * factor

# Add noise last, then limit the price to [5000, 100000].
noise = np.random.randn(n_samples) * 2000
df['price'] = (price + noise).clip(5000, 100000)

# Every column except the target counts as a feature.
n_features = df.shape[1] - 1

print("데이터셋 정보:")
print(f"  샘플 수: {len(df)}")
print(f"  특성 수: {n_features}")
print(f"\n처음 5행:")
print(df.head())

Step 2: 탐색적 데이터 분석

가격 분포

# Two-panel figure: overall price histogram (left) and price by brand (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
hist_ax, box_ax = axes

# Histogram of all prices.
hist_ax.hist(df['price'], bins=50, edgecolor='black', alpha=0.7)
hist_ax.set(xlabel='Price ($)', ylabel='Frequency', title='Price Distribution')

# Box plot of price grouped by brand; the labels and title are set after the
# pandas call so they replace whatever the grouped box plot put there.
df.boxplot(column='price', by='brand', ax=box_ax)
box_ax.set(xlabel='Brand', ylabel='Price ($)', title='Price by Brand')
# Blank out the figure-level title.
plt.suptitle('')

plt.tight_layout()
plt.show()

특성별 분석

# 2x2 grid: three feature-vs-price scatter plots and one grouped box plot.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (axis, feature column, x-axis label, panel title) for each scatter panel.
scatter_panels = [
    (axes[0, 0], 'year', 'Year', 'Year vs Price'),
    (axes[0, 1], 'mileage', 'Mileage', 'Mileage vs Price'),
    (axes[1, 0], 'horsepower', 'Horsepower', 'Horsepower vs Price'),
]
for panel_ax, column, x_label, panel_title in scatter_panels:
    panel_ax.scatter(df[column], df['price'], alpha=0.3, s=10)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Price ($)')
    panel_ax.set_title(panel_title)

# Price by fuel type; labels are set after the pandas call so they replace
# whatever the grouped box plot put there.
fuel_ax = axes[1, 1]
df.boxplot(column='price', by='fuel_type', ax=fuel_ax)
fuel_ax.set_xlabel('Fuel Type')
fuel_ax.set_ylabel('Price ($)')
fuel_ax.set_title('Price by Fuel Type')
# Blank out the figure-level title.
plt.suptitle('')

plt.tight_layout()
plt.show()

상관관계

# Correlation matrix restricted to the numeric columns.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation = df.loc[:, numeric_cols].corr()

# Annotated heatmap, colour scale centred on zero.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Correlation of every numeric column with the target, strongest positive first.
price_correlation = correlation['price'].sort_values(ascending=False)
print("가격과의 상관관계:")
print(price_correlation)

Step 3: 데이터 전처리

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Integer-encode the categorical columns. One fitted encoder per column is
# kept in `label_encoders` so the same mapping can be reused later.
categorical_cols = ['brand', 'fuel_type', 'transmission']
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Encode on a copy so the original string-valued frame stays intact.
df_encoded = df.copy()
for col, encoder in label_encoders.items():
    df_encoded[col] = encoder.transform(df_encoded[col])

# Separate the feature matrix from the target column.
y = df_encoded['price']
X = df_encoded.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

print(f"학습 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")

Step 4: FLAML AutoML 학습

from flaml import AutoML

# Run the FLAML AutoML search on the training split.
fit_settings = {
    "task": "regression",
    "time_budget": 120,   # search budget passed to FLAML
    "metric": "r2",
    "seed": 42,
    "verbose": 1,
}
automl = AutoML()
automl.fit(X_train, y_train, **fit_settings)

# The script treats `best_loss` as 1 - R^2 for the "r2" metric and converts
# it back to R^2 for display.
best_name = automl.best_estimator
validation_r2 = 1 - automl.best_loss

print(f"\n최적 모델: {best_name}")
print(f"검증 R²: {validation_r2:.4f}")

Step 5: 모델 평가

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out test split.
y_pred = automl.predict(X_test)

# Regression metrics on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Mean absolute percentage error: |actual - predicted| relative to actual.
relative_error = (y_test - y_pred) / y_test
mape = np.mean(np.abs(relative_error)) * 100

report_lines = [
    "테스트 성능:",
    f"  RMSE: ${rmse:,.0f}",
    f"  MAE: ${mae:,.0f}",
    f"  R²: {r2:.4f}",
    f"  MAPE: {mape:.2f}%",
]
for report_line in report_lines:
    print(report_line)

예측 시각화

# Two-panel figure: actual-vs-predicted scatter (left), residual histogram (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
scatter_ax, residual_ax = axes

# Actual vs predicted, with a dashed y = x reference line spanning the range
# of actual prices.
scatter_ax.scatter(y_test, y_pred, alpha=0.3, s=10)
actual_range = [y_test.min(), y_test.max()]
scatter_ax.plot(actual_range, actual_range, 'r--', linewidth=2)
scatter_ax.set_xlabel('Actual Price ($)')
scatter_ax.set_ylabel('Predicted Price ($)')
scatter_ax.set_title(f'Actual vs Predicted (R² = {r2:.4f})')

# Residuals are actual minus predicted; the dashed line marks zero error.
residuals = y_test - y_pred
residual_ax.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
residual_ax.axvline(x=0, color='r', linestyle='--')
residual_ax.set_xlabel('Residual ($)')
residual_ax.set_ylabel('Frequency')
residual_ax.set_title('Residual Distribution')

plt.tight_layout()
plt.show()

Step 6: 특성 중요도

# Feature importances of the best model found by the search.
#
# FLAML exposes the best trained model as `automl.model` (a FLAML wrapper)
# and the underlying learner as `automl.model.estimator`; `AutoML` has no
# `best_model` attribute, so probing `automl.best_model` raises
# AttributeError before `hasattr` can return False.
best_wrapper = getattr(automl, 'model', None)
best_learner = getattr(best_wrapper, 'estimator', best_wrapper)

if hasattr(best_learner, 'feature_importances_'):
    # Coerce to an ndarray so the fancy indexing below works even if the
    # learner returns a plain sequence.
    importance = np.asarray(best_learner.feature_importances_)
    feature_names = X.columns

    # Indices of the features, most important first.
    sorted_idx = np.argsort(importance)[::-1]

    plt.figure(figsize=(10, 6))
    plt.bar(range(len(importance)), importance[sorted_idx])
    plt.xticks(range(len(importance)), feature_names[sorted_idx], rotation=45, ha='right')
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.title('Feature Importance for Car Price Prediction')
    plt.tight_layout()
    plt.show()

    # Print the five most important features.
    print("특성 중요도:")
    for i in sorted_idx[:5]:
        print(f"  {feature_names[i]}: {importance[i]:.4f}")
else:
    # Not every learner exposes importances; say so instead of printing nothing.
    print("선택된 모델은 특성 중요도(feature_importances_)를 제공하지 않습니다.")

Step 7: 브랜드별 성능 분석

# Per-row prediction errors on the test split, with the brand code decoded
# back to its name.
brand_code_to_name = dict(enumerate(label_encoders['brand'].classes_))

df_test = X_test.assign(actual=y_test.values, predicted=y_pred)
df_test['error'] = df_test['actual'] - df_test['predicted']
df_test['abs_error'] = df_test['error'].abs()
df_test['brand_name'] = df_test['brand'].map(brand_code_to_name)

# Mean absolute error per brand, smallest error first.
abs_error_by_brand = df_test.groupby('brand_name')['abs_error']
brand_mae = abs_error_by_brand.mean().sort_values()

plt.figure(figsize=(10, 6))
brand_mae.plot(kind='barh')
plt.xlabel('Mean Absolute Error ($)')
plt.ylabel('Brand')
plt.title('Prediction Error by Brand')
plt.tight_layout()
plt.show()

print("브랜드별 MAE:")
print(brand_mae.round(0))

Step 8: 새 차량 가격 예측

def predict_car_price(car_data: dict, model, encoders: dict, feature_order=None):
    """Predict the price of a single car.

    Args:
        car_data: Mapping of feature name to raw value for one car.
            Categorical features are given as their original string labels.
        model: Fitted object with a ``predict(DataFrame)`` method whose
            result is indexable; element 0 is returned.
        encoders: Mapping of column name to a fitted encoder with a
            ``transform`` method. Every encoder whose column is present in
            ``car_data`` is applied; encoders for absent columns are skipped.
        feature_order: Optional sequence of feature names in the order the
            model was trained on. When given, the input columns are
            reordered to match and any columns not listed are dropped.
            When ``None`` (the default) the key order of ``car_data`` is
            used unchanged.

    Returns:
        The first element of ``model.predict`` for the one-row frame.

    Raises:
        ValueError: If ``feature_order`` names a feature that is missing
            from ``car_data``.
    """
    df_new = pd.DataFrame([car_data])

    # Encode categorical columns with the encoders fitted during training.
    for col, encoder in encoders.items():
        if col in df_new.columns:
            df_new[col] = encoder.transform(df_new[col])

    # Align columns with the training layout: a model that matches features
    # by position would otherwise silently read the wrong columns when the
    # dict keys arrive in a different order.
    if feature_order is not None:
        missing = [name for name in feature_order if name not in df_new.columns]
        if missing:
            raise ValueError(f"car_data is missing required features: {missing}")
        df_new = df_new[list(feature_order)]

    return model.predict(df_new)[0]

# Sample cars to price. Field order matches the training feature columns.
car_fields = (
    'brand', 'year', 'mileage',
    'fuel_type', 'transmission',
    'engine_size', 'horsepower', 'seats', 'previous_owners',
)
car_rows = [
    ('Toyota', 2020, 30000, 'Hybrid', 'Automatic', 2.0, 180, 5, 1),
    ('BMW', 2018, 60000, 'Gasoline', 'Automatic', 3.0, 300, 5, 2),
    ('Hyundai', 2022, 15000, 'Electric', 'Automatic', 0.0, 200, 5, 1),
]
test_cars = [dict(zip(car_fields, row)) for row in car_rows]

# Print a short report for each sample car.
print("새 차량 가격 예측:")
print("-" * 60)
for i, car in enumerate(test_cars, start=1):
    price = predict_car_price(car, automl, label_encoders)
    summary = [
        f"\n차량 {i}: {car['brand']} {car['year']}",
        f"  주행거리: {car['mileage']:,}km",
        f"  연료: {car['fuel_type']}, 마력: {car['horsepower']}hp",
        f"  예상 가격: ${price:,.0f}",
    ]
    for summary_line in summary:
        print(summary_line)

Step 9: 모델 저장

import pickle

# Bundle the fitted AutoML object, the per-column encoders and the training
# feature order into one object and pickle it.
# NOTE: pickle files can execute code when loaded; only unpickle this file
# if it comes from a trusted source.
model_package = dict(
    model=automl,
    encoders=label_encoders,
    features=list(X.columns),
)

with open('car_price_model.pkl', 'wb') as model_file:
    pickle.dump(model_package, model_file)

print("모델 저장 완료: car_price_model.pkl")

정리

자동차 가격 예측에서 연식, 주행거리, 브랜드가 중요한 특성입니다.
R² = 0.90+ 수준의 좋은 예측 성능을 달성했습니다.
범주형 변수(브랜드, 연료, 변속기)의 적절한 인코딩이 중요합니다.
MAPE로 평균 예측 오차율을 확인할 수 있습니다.
브랜드별 성능 분석으로 모델 신뢰도를 파악할 수 있습니다.

다음 글 예고

다음 글에서는 회귀 프로젝트 - 매출 예측에 대해 알아보겠습니다. 시계열적 특성을 가진 매출 데이터를 예측합니다.

FLAML AutoML 마스터 시리즈 #043

개요​

실습 환경​

프로젝트 개요​

목표​

활용 분야​

Step 1: 데이터 준비​

Step 2: 탐색적 데이터 분석​

가격 분포​

특성별 분석​

상관관계​

Step 3: 데이터 전처리​

Step 4: FLAML AutoML 학습​

Step 5: 모델 평가​

예측 시각화​

Step 6: 특성 중요도​

Step 7: 브랜드별 성능 분석​

Step 8: 새 차량 가격 예측​

Step 9: 모델 저장​

정리​

다음 글 예고​

개요

실습 환경

프로젝트 개요

목표

활용 분야

Step 1: 데이터 준비

Step 2: 탐색적 데이터 분석

가격 분포

특성별 분석

상관관계

Step 3: 데이터 전처리

Step 4: FLAML AutoML 학습

Step 5: 모델 평가

예측 시각화

Step 6: 특성 중요도

Step 7: 브랜드별 성능 분석

Step 8: 새 차량 가격 예측

Step 9: 모델 저장

정리

다음 글 예고