085 종합 프로젝트 - 이미지 특성 기반 분류

키워드: 이미지, 특성 추출, 분류

개요

딥러닝 모델로 이미지에서 특성을 추출하고, FLAML AutoML로 분류 모델을 구축합니다. 전이 학습과 AutoML을 결합한 실용적인 접근법을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, numpy, scikit-learn, matplotlib, Pillow

pip install "flaml[automl]" pandas numpy scikit-learn matplotlib Pillow

이미지 특성 추출 전략

import numpy as np
import pandas as pd

# Overview of the feature-extraction approaches, written one row per method.
_strategy_columns = ('방법', '설명', '장점', 'FLAML 적용')
_strategy_rows = [
    ('사전학습 CNN', 'ResNet, VGG 등의 특성 벡터', '고수준 특성, 높은 성능', '특성 벡터 → 분류'),
    ('전통적 특성', 'HOG, SIFT, LBP 등', '해석 가능, 경량', '특성 벡터 → 분류'),
    ('히스토그램', '색상/그레이스케일 분포', '간단, 빠름', '특성 벡터 → 분류'),
]

# Transpose the rows into the column-oriented mapping that pandas consumes.
strategies = {
    column: [row[position] for row in _strategy_rows]
    for position, column in enumerate(_strategy_columns)
}

strategy_table = pd.DataFrame(strategies)
print("이미지 특성 추출 전략:")
print(strategy_table.to_string(index=False))

시뮬레이션 데이터 생성

# Simulated feature vectors stand in for real image embeddings.
np.random.seed(42)

n_samples = 2000
n_features = 512  # comparable to the output width of a pretrained CNN

# Three simulated classes (cat, dog, bird).
n_classes = 3
class_names = ['cat', 'dog', 'bird']

# One random centre per class; every sample is its class centre plus noise.
class_centers = np.random.randn(n_classes, n_features) * 2


def _draw_sample():
    """Draw a (label, feature vector) pair: label first, then the noise."""
    label = np.random.randint(0, n_classes)
    vector = class_centers[label] + np.random.randn(n_features) * 0.5
    return label, vector


_samples = [_draw_sample() for _ in range(n_samples)]
class_labels = [label for label, _ in _samples]
features = [vector for _, vector in _samples]

X = np.array(features)
y = np.array(class_labels)

print(f"특성 형태: {X.shape}")
print(f"클래스 분포: {np.bincount(y)}")
print(f"클래스: {class_names}")

전통적 이미지 특성 시뮬레이션

def simulate_traditional_features(n_samples, class_id):
    """Simulate one hand-crafted feature vector for an image of ``class_id``.

    The result concatenates a flattened 3 x 32 Dirichlet-sampled colour
    histogram (96 values), 16 texture values and two shape scalars
    (aspect ratio, compactness), i.e. 114 values in total.

    ``n_samples`` is accepted but not used by the body.
    """
    # Dirichlet concentration per class: 1.0 for class 0, 0.5 for class 1,
    # and 2.0 for any other class id.
    if class_id == 0:
        concentration = 1.0
    elif class_id == 1:
        concentration = 0.5
    else:
        concentration = 2.0

    # Colour histogram: three channels of 32 bins each, flattened.
    color_hist = np.random.dirichlet(np.full(32, concentration), 3).ravel()

    # LBP-like texture values, shifted upward by the class id.
    texture = np.random.uniform(0, 1, 16) + class_id * 0.1

    # Shape descriptors, also shifted by the class id.
    aspect_ratio = np.random.uniform(0.8, 1.2) + class_id * 0.05
    compactness = np.random.uniform(0.6, 0.9) + class_id * 0.03
    shape_values = np.array([aspect_ratio, compactness])

    return np.concatenate((color_hist, texture, shape_values))

# Hand-crafted feature dataset: one simulated vector per label in y, so the
# rows stay aligned with the CNN-style features.
traditional_features = [
    simulate_traditional_features(1, label) for label in y
]

X_traditional = np.array(traditional_features)
print(f"\n전통적 특성 형태: {X_traditional.shape}")

데이터 분할

from sklearn.model_selection import train_test_split

# Split the CNN-style and hand-crafted feature matrices in ONE call.
# Both matrices share the same labels, so splitting them together guarantees
# that X_train / X_trad_train (and the test counterparts) stay row-aligned
# with y_train / y_test by construction, rather than relying on two separate
# calls reproducing the same shuffle from an identical seed.
(X_train, X_test,
 X_trad_train, X_trad_test,
 y_train, y_test) = train_test_split(
    X, X_traditional, y, test_size=0.2, random_state=42, stratify=y
)

print("데이터 분할:")
print(f"  CNN 특성 - 학습: {X_train.shape}, 테스트: {X_test.shape}")
print(f"  전통적 특성 - 학습: {X_trad_train.shape}, 테스트: {X_trad_test.shape}")

FLAML로 CNN 특성 분류

from flaml import AutoML
from sklearn.metrics import accuracy_score, classification_report

# Classification on the CNN-style features.
# The search settings are collected in one mapping and unpacked into fit().
cnn_fit_settings = {
    "task": "classification",
    "metric": "accuracy",
    "time_budget": 60,
    "estimator_list": ['lgbm', 'xgboost', 'rf'],
    "n_jobs": -1,
    "seed": 42,
    "verbose": 1,
}

automl_cnn = AutoML()
automl_cnn.fit(X_train, y_train, **cnn_fit_settings)

print("\n=== CNN 특성 분류 결과 ===")
print(f"최적 모델: {automl_cnn.best_estimator}")

# Evaluate on the held-out split.
y_pred_cnn = automl_cnn.predict(X_test)
cnn_accuracy = accuracy_score(y_test, y_pred_cnn)
cnn_report = classification_report(y_test, y_pred_cnn, target_names=class_names)

print(f"\n테스트 정확도: {cnn_accuracy:.4f}")
print("\n분류 리포트:")
print(cnn_report)

FLAML로 전통적 특성 분류

# Classification on the hand-crafted (traditional) features.
# The search settings are collected in one mapping and unpacked into fit().
trad_fit_settings = {
    "task": "classification",
    "metric": "accuracy",
    "time_budget": 60,
    "estimator_list": ['lgbm', 'xgboost', 'rf'],
    "n_jobs": -1,
    "seed": 42,
    "verbose": 1,
}

automl_trad = AutoML()
automl_trad.fit(X_trad_train, y_train, **trad_fit_settings)

print("\n=== 전통적 특성 분류 결과 ===")
print(f"최적 모델: {automl_trad.best_estimator}")

# Evaluate on the held-out split.
y_pred_trad = automl_trad.predict(X_trad_test)
trad_accuracy = accuracy_score(y_test, y_pred_trad)
trad_report = classification_report(y_test, y_pred_trad, target_names=class_names)

print(f"\n테스트 정확도: {trad_accuracy:.4f}")
print("\n분류 리포트:")
print(trad_report)

특성 결합 앙상블

# Combine CNN-style and hand-crafted features by joining them column-wise.
X_combined_train = np.concatenate((X_train, X_trad_train), axis=1)
X_combined_test = np.concatenate((X_test, X_trad_test), axis=1)

print(f"결합 특성 형태: {X_combined_train.shape}")

# Classification on the combined feature matrix.
combined_fit_settings = {
    "task": "classification",
    "metric": "accuracy",
    "time_budget": 60,
    "estimator_list": ['lgbm', 'xgboost', 'rf'],
    "n_jobs": -1,
    "seed": 42,
    "verbose": 1,
}

automl_combined = AutoML()
automl_combined.fit(X_combined_train, y_train, **combined_fit_settings)

print("\n=== 결합 특성 분류 결과 ===")
y_pred_combined = automl_combined.predict(X_combined_test)
combined_accuracy = accuracy_score(y_test, y_pred_combined)
print(f"테스트 정확도: {combined_accuracy:.4f}")

성능 비교

# Compare the three approaches: one (label, predictions, AutoML) entry each.
_compared_runs = [
    ('CNN 특성', y_pred_cnn, automl_cnn),
    ('전통적 특성', y_pred_trad, automl_trad),
    ('결합 특성', y_pred_combined, automl_combined),
]

results = {
    '방법': [label for label, _, _ in _compared_runs],
    '정확도': [
        accuracy_score(y_test, predictions)
        for _, predictions, _ in _compared_runs
    ],
    '최적 모델': [run.best_estimator for _, _, run in _compared_runs],
}

results_table = pd.DataFrame(results)
print("\n=== 방법별 성능 비교 ===")
print(results_table.to_string(index=False))

특성 중요도 분석

import matplotlib.pyplot as plt

# Feature importance of the combined model, aggregated per feature group.
# Skipped entirely when the selected estimator exposes no importances.
if hasattr(automl_combined.model.estimator, 'feature_importances_'):
    importances = automl_combined.model.estimator.feature_importances_

    # The first X.shape[1] columns are the CNN-style features; the remaining
    # columns are the hand-crafted ones appended during feature combination.
    n_cnn_columns = X.shape[1]
    cnn_importance = importances[:n_cnn_columns].sum()
    trad_importance = importances[n_cnn_columns:].sum()

    print("\n=== 특성 그룹별 중요도 ===")
    print(f"CNN 특성 총 중요도: {cnn_importance:.4f}")
    print(f"전통적 특성 총 중요도: {trad_importance:.4f}")

    # Bar chart comparing the two group totals.
    group_labels = ['CNN Features', 'Traditional Features']
    group_totals = [cnn_importance, trad_importance]
    plt.figure(figsize=(8, 5))
    plt.bar(group_labels, group_totals, color=['steelblue', 'coral'])
    plt.ylabel('Total Importance')
    plt.title('Feature Group Importance')
    plt.show()

예측 파이프라인

class ImageClassificationPipeline:
    """Wrap a fitted classifier together with human-readable class names.

    Args:
        model: Object exposing ``predict`` and ``predict_proba`` that accept
            a 2-D array with one row per sample.
        class_names: Container indexed by the model's predicted label that
            yields the display name of each class.
        feature_dim: Length of the vector returned by ``extract_features``.
            Defaults to 512, the size previously hard-coded.
    """

    def __init__(self, model, class_names, feature_dim=512):
        self.model = model
        self.class_names = class_names
        self.feature_dim = feature_dim

    def extract_features(self, image_data):
        """Return a simulated feature vector of length ``feature_dim``.

        ``image_data`` is ignored: this is a placeholder that draws random
        values where a real CNN feature extractor would be called.
        """
        return np.random.randn(self.feature_dim)

    @staticmethod
    def _as_single_row(features):
        """Coerce one feature vector (any array-like) to shape (1, n)."""
        # np.asarray lets plain lists/tuples through; previously only objects
        # that already had a .reshape method (ndarrays) were accepted.
        return np.asarray(features).reshape(1, -1)

    def predict(self, features):
        """Return the class name predicted for a single feature vector."""
        pred = self.model.predict(self._as_single_row(features))[0]
        return self.class_names[pred]

    def predict_proba(self, features):
        """Return ``{class name: probability}`` for a single feature vector.

        Probabilities are paired with ``class_names`` positionally, so the
        model's class order must match the order of ``class_names``.
        """
        proba = self.model.predict_proba(self._as_single_row(features))[0]
        return dict(zip(self.class_names, proba))

# Exercise the pipeline end to end with the CNN-feature model.
pipeline = ImageClassificationPipeline(automl_cnn, class_names)

# Use the first held-out sample as the probe.
sample_index = 0
test_features = X_test[sample_index]
actual_class = class_names[y_test[sample_index]]

predicted_class = pipeline.predict(test_features)
probabilities = pipeline.predict_proba(test_features)

print("\n=== 예측 파이프라인 테스트 ===")
print(f"예측 클래스: {predicted_class}")
print(f"실제 클래스: {actual_class}")
print(f"예측 확률: {probabilities}")

실제 CNN 특성 추출 예시 (참고)

# Reference-only snippet showing feature extraction with a real CNN.
# It is stored as a string and only printed below; it is never executed here.
# NOTE(review): the snippet loads ResNet-50 with `pretrained=True`; newer
# torchvision releases presumably prefer the `weights=` argument, and the
# pooled ResNet-50 output is presumably 2048-d rather than the 512 features
# used by the simulation in this article — confirm before copying.
example_code = """
# 085 실제 이미지에서 특성 추출
import torch
from torchvision import models, transforms
from PIL import Image

# 085 사전학습된 ResNet 로드
resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # 마지막 FC 제거
resnet.eval()

# 085 이미지 전처리
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

def extract_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image_tensor = transform(image).unsqueeze(0)

    with torch.no_grad():
        features = resnet(image_tensor)

    return features.squeeze().numpy()

# 085 특성 추출 후 FLAML로 분류
# 085 features = extract_features('image.jpg')
# 085 prediction = automl.predict(features.reshape(1, -1))
"""

print("실제 CNN 특성 추출 예시 코드:")
print(example_code)

모델 저장

import joblib

# Persist all three classifiers together with the class names in one file.
model_bundle = {
    'cnn_model': automl_cnn,
    'traditional_model': automl_trad,
    'combined_model': automl_combined,
    'class_names': class_names,
}
joblib.dump(model_bundle, 'image_classifier.pkl')

print("\n모델 저장: image_classifier.pkl")

정리

특성 추출: CNN 사전학습 모델 또는 전통적 방법
FLAML 분류: 추출된 특성으로 AutoML 학습
특성 결합: CNN + 전통적 특성 앙상블
파이프라인: 특성 추출 → 분류 자동화
CNN 특성이 일반적으로 전통적 특성보다 높은 성능을 보입니다.

다음 글 예고

다음 글에서는 텍스트 분류 (TF-IDF + FLAML) 프로젝트를 진행합니다. 텍스트 데이터를 벡터화하여 FLAML로 분류하는 방법을 알아봅니다.

FLAML AutoML 마스터 시리즈 #085

개요

실습 환경

이미지 특성 추출 전략

시뮬레이션 데이터 생성

전통적 이미지 특성 시뮬레이션

데이터 분할

FLAML로 CNN 특성 분류

FLAML로 전통적 특성 분류

특성 결합 앙상블

성능 비교

특성 중요도 분석

예측 파이프라인

실제 CNN 특성 추출 예시 (참고)

모델 저장

정리

다음 글 예고

개요