070 BlendSearch 알고리즘 이해

키워드: BlendSearch, 탐색 알고리즘, 하이퍼파라미터 최적화

개요

BlendSearch는 FLAML의 핵심 하이퍼파라미터 탐색 알고리즘입니다. 글로벌 탐색과 로컬 탐색을 결합하여 효율적으로 최적 설정을 찾습니다. 이 글에서는 BlendSearch의 원리와 활용법을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl,blendsearch] (BlendSearch의 글로벌 탐색에는 optuna가 필요하며, blendsearch extra가 이를 함께 설치합니다)

pip install "flaml[automl,blendsearch]" pandas numpy matplotlib

BlendSearch란?

import numpy as np
import pandas as pd
from flaml import AutoML, tune

# One tuple per BlendSearch building block: (component, role, technique).
_component_rows = [
    ('글로벌 탐색', '전체 탐색 공간 탐험 (Exploration)', 'Bayesian Optimization'),
    ('로컬 탐색', '유망 영역 집중 (Exploitation)', 'CFO (Cost-Frugal Optimization)'),
    ('경제적 평가', '저비용 설정 우선 평가', 'Early stopping + 비용 인식'),
]

# Column-oriented view of the rows above, in display order.
concepts = {
    '구성요소': [component for component, _, _ in _component_rows],
    '역할': [role for _, role, _ in _component_rows],
    '기술': [technique for _, _, technique in _component_rows],
}

print("BlendSearch 핵심 개념:")
print("=" * 50)

concepts_table = pd.DataFrame(concepts)
print(concepts_table.to_string(index=False))

글로벌 vs 로컬 탐색

import matplotlib.pyplot as plt

# Visualize how each search strategy samples a 2-D parameter space.
np.random.seed(42)


def _label_panel(panel, title):
    """Set the title and the shared axis labels of one subplot."""
    panel.set_title(title)
    panel.set_xlabel('Param 1')
    panel.set_ylabel('Param 2')


fig, axes = plt.subplots(1, 3, figsize=(15, 5))
global_ax, local_ax, blend_ax = axes

# Panel 1: global search (random / Bayesian) -- uniform samples over the space.
x_global = np.random.uniform(0, 10, 50)
y_global = np.random.uniform(0, 10, 50)
global_ax.scatter(x_global, y_global, alpha=0.6)
_label_panel(global_ax, 'Global Search\n(Exploration)')

# Panel 2: local search (CFO) -- samples clustered around one incumbent.
center = (5, 5)
center_x, center_y = center
x_local = center_x + np.random.randn(50) * 0.5
y_local = center_y + np.random.randn(50) * 0.5
local_ax.scatter(x_local, y_local, alpha=0.6, color='orange')
local_ax.scatter(*center, color='red', s=100, marker='*', label='Best found')
_label_panel(local_ax, 'Local Search\n(Exploitation)')
local_ax.legend()

# Panel 3: BlendSearch -- sparse global samples plus several local clusters.
# The global points are drawn from the RNG first, matching the sampling order
# the figure depends on for reproducibility.
x_blend_g = np.random.uniform(0, 10, 20)
y_blend_g = np.random.uniform(0, 10, 20)
centers = [(3, 7), (7, 3)]
for cluster_x, cluster_y in centers:
    blend_ax.scatter(
        cluster_x + np.random.randn(15) * 0.5,
        cluster_y + np.random.randn(15) * 0.5,
        alpha=0.6,
        color='orange',
    )
blend_ax.scatter(x_blend_g, y_blend_g, alpha=0.4, color='blue', label='Global')
star_x, star_y = zip(*centers)
blend_ax.scatter(star_x, star_y, color='red', s=100, marker='*', label='Local centers')
_label_panel(blend_ax, 'BlendSearch\n(Global + Local)')
blend_ax.legend()

plt.tight_layout()
plt.show()

# Summary of what the three panels illustrate.
print("\n".join([
    "\nBlendSearch 장점:",
    "  - 글로벌: 전체 공간 탐험으로 좋은 영역 발견",
    "  - 로컬: 발견된 영역에서 세밀한 최적화",
    "  - 결합: 두 장점을 모두 활용",
]))

BlendSearch 작동 원리

print("BlendSearch 작동 순서:")
print("=" * 50)

# Each row is one stage of the search loop: (stage, description, cost).
_step_headers = ('단계', '설명', '비용')
_step_rows = [
    ('1. 초기화', '저비용 초기 설정으로 시작', '낮음'),
    ('2. 글로벌 탐색', 'Bayesian Optimization으로 전체 공간 탐색', '중간'),
    ('3. 유망 영역 선정', '좋은 성능을 보인 영역 식별', '-'),
    ('4. 로컬 탐색', 'CFO로 유망 영역 집중 탐색', '낮음~중간'),
    ('5. 반복', '시간 예산까지 글로벌/로컬 교대', '동적'),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from.
steps = {
    header: list(column)
    for header, column in zip(_step_headers, zip(*_step_rows))
}

steps_table = pd.DataFrame(steps)
print(steps_table.to_string(index=False))

FLAML에서 BlendSearch 사용

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Prepare a synthetic classification dataset and hold out 20% for testing.
X, y = make_classification(n_samples=2000, n_features=20,
                           n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# AutoML's default hpo_method ("auto") uses CFO for sequential search and
# BlendSearch only for parallel search, so BlendSearch is requested explicitly.
# BlendSearch needs optuna: pip install "flaml[blendsearch]".
automl = AutoML()
automl.fit(
    X_train, y_train,
    task="classification",
    time_budget=60,
    hpo_method="bs",
    verbose=2,  # FLAML's default is 3; higher values print more messages
)

print(f"\n최적 모델: {automl.best_estimator}")
print(f"최적 설정: {automl.best_config}")
# config_history gains an entry only when the best model is updated, so its
# length is the number of improvements, not the number of trials.
print(f"최적 모델 갱신 횟수: {len(automl.config_history)}")

탐색 히스토리 분석

def analyze_search_history(automl):
    """Plot and summarize the search history stored on an AutoML object.

    Reads ``automl.config_history`` (iteration id -> record). FLAML documents
    each record as a ``(learner, config, time)`` tuple, which carries no loss
    value; dict records exposing ``'learner'`` and ``'val_loss'`` keys are
    accepted as well. Draws a two-panel figure (loss curve and per-estimator
    counts) and prints summary statistics.

    Args:
        automl: Object exposing a ``config_history`` mapping.

    Returns:
        None. An empty history prints a notice and returns before plotting.
    """
    from collections import Counter

    history = automl.config_history

    if not history:
        print("탐색 히스토리가 비어있습니다.")
        return

    # Extract iteration ids, losses and learner names from every record.
    iterations = []
    losses = []
    estimators = []

    for iter_id, record in history.items():
        iterations.append(iter_id)
        if isinstance(record, dict):
            losses.append(record.get('val_loss'))
            estimators.append(record.get('learner', 'unknown'))
        else:
            # Tuple record: the learner name comes first and no loss is stored,
            # so calling .get() on it would raise AttributeError.
            losses.append(None)
            estimators.append(record[0] if record else 'unknown')

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: loss curve over the records that actually carry a loss.
    valid_losses = [loss for loss in losses if loss is not None]
    ax = axes[0]
    if valid_losses:
        positions = range(len(valid_losses))
        ax.plot(positions, valid_losses, 'b-', alpha=0.7)
        ax.scatter(positions, valid_losses, c='blue', s=20)

        # Highlight the lowest loss.
        min_idx = np.argmin(valid_losses)
        ax.scatter(min_idx, valid_losses[min_idx], c='red', s=100, marker='*',
                   zorder=5, label=f'Best: {valid_losses[min_idx]:.4f}')
        ax.legend()
    else:
        # Nothing to plot; say so instead of leaving a blank, unexplained panel.
        ax.text(0.5, 0.5, 'No validation loss recorded',
                ha='center', va='center', transform=ax.transAxes)
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Validation Loss')
    ax.set_title('Search Progress')

    # Right panel: how many records each estimator contributed.
    ax = axes[1]
    est_counts = Counter(estimators)
    ax.bar(list(est_counts.keys()), list(est_counts.values()))
    ax.set_xlabel('Estimator')
    ax.set_ylabel('Count')
    ax.set_title('Estimator Exploration Count')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    # min() on an empty list raises ValueError, so fall back to a placeholder.
    best_loss_text = f"{min(valid_losses):.4f}" if valid_losses else "N/A"
    print(f"\n탐색 통계:")
    print(f"  총 반복: {len(iterations)}")
    print(f"  최적 손실: {best_loss_text}")
    print(f"  탐색된 모델: {dict(est_counts)}")

# Analyze the history of the `automl` object fitted earlier in this file.
analyze_search_history(automl)

tune.run으로 BlendSearch 직접 사용

from flaml import tune
from flaml.tune.searcher import BlendSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def evaluate_config(config):
    """Score one hyperparameter configuration with 3-fold cross-validation.

    Builds a ``RandomForestClassifier`` (``random_state=42``) from the
    ``n_estimators``, ``max_depth`` and ``min_samples_split`` entries of
    ``config`` and evaluates it on the module-level ``X_train`` / ``y_train``.

    Args:
        config: Mapping that contains the three hyperparameters above.

    Returns:
        ``{'accuracy': <mean accuracy over the 3 folds>}``.
    """
    forest_params = {
        name: config[name]
        for name in ('n_estimators', 'max_depth', 'min_samples_split')
    }
    forest = RandomForestClassifier(random_state=42, **forest_params)

    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    return {'accuracy': mean_accuracy}

# Search space for the random-forest hyperparameters.
search_space = {
    'n_estimators': tune.randint(50, 300),
    'max_depth': tune.randint(3, 20),
    'min_samples_split': tune.randint(2, 20),
}

# Pass a BlendSearch instance explicitly. With search_alg omitted, tune.run
# only selects BlendSearch when optuna is installed and otherwise falls back
# to CFO, so the run would not necessarily exercise BlendSearch.
# BlendSearch needs optuna: pip install "flaml[blendsearch]".
blend_searcher = BlendSearch(
    metric='accuracy',
    mode='max',
    space=search_space,
)

# Stop after 50 samples or 60 seconds, whichever budget is exhausted first.
analysis = tune.run(
    evaluate_config,
    config=search_space,
    search_alg=blend_searcher,
    metric='accuracy',
    mode='max',
    num_samples=50,
    time_budget_s=60,
    verbose=1,
)

print(f"\n최적 설정: {analysis.best_config}")
print(f"최적 정확도: {analysis.best_result['accuracy']:.4f}")

BlendSearch 파라미터 조정

# Fine-grained BlendSearch configuration.
from flaml.tune.searcher.blendsearch import BlendSearch

# Low-cost starting values handed to the searcher.
low_cost_start = {
    'n_estimators': 50,
    'max_depth': 3,
}

# A configuration already known to work well.
known_good_configs = [
    {'n_estimators': 100, 'max_depth': 6, 'min_samples_split': 2},
]

# Custom searcher: maximize accuracy over the shared search space.
searcher = BlendSearch(
    metric='accuracy',
    mode='max',
    space=search_space,
    low_cost_partial_config=low_cost_start,
    points_to_evaluate=known_good_configs,
)

# Metric and mode are not repeated here; they were given to the searcher.
custom_run_options = {
    'config': search_space,
    'search_alg': searcher,
    'num_samples': 30,
    'time_budget_s': 60,
}
analysis_custom = tune.run(evaluate_config, **custom_run_options)

print(f"커스텀 BlendSearch 결과: {analysis_custom.best_config}")

글로벌/로컬 비율 조정

print("BlendSearch 비율 조정:")
print("=" * 50)

# One record per tuning profile; keys double as the table's column headers.
_ratio_columns = ['설정', '글로벌:로컬', '적합한 상황', '방법']
_ratio_records = [
    {
        '설정': '기본',
        '글로벌:로컬': '자동 조정',
        '적합한 상황': '일반적인 경우',
        '방법': 'FLAML 기본 사용',
    },
    {
        '설정': '탐험 중심',
        '글로벌:로컬': '높은 글로벌',
        '적합한 상황': '탐색 공간이 크거나 불확실할 때',
        '방법': 'num_samples 증가',
    },
    {
        '설정': '활용 중심',
        '글로벌:로컬': '높은 로컬',
        '적합한 상황': '이미 좋은 영역을 알고 있을 때',
        '방법': 'points_to_evaluate 활용',
    },
]

# Pivot the records into per-column lists, keeping the column order fixed.
ratios = {
    column: [record[column] for record in _ratio_records]
    for column in _ratio_columns
}

ratios_table = pd.DataFrame(ratios)
print(ratios_table.to_string(index=False))

비용 인식 탐색

print("\n비용 인식 탐색 (Cost-Aware Search):")
print("=" * 50)


# Example cost function.
def get_training_cost(config):
    """Estimate the relative training cost of a configuration.

    The estimate is ``n_estimators * 2 ** depth`` with the depth capped at 10.
    Missing keys default to ``n_estimators=100`` and ``max_depth=10``.

    Args:
        config: Mapping that may contain ``n_estimators`` and ``max_depth``.

    Returns:
        The estimated cost as a number.
    """
    tree_count = config.get('n_estimators', 100)
    # Cap the depth so the exponential term stays bounded.
    capped_depth = min(config.get('max_depth', 10), 10)
    return tree_count * (2 ** capped_depth)


# Example: rank a few configurations from cheapest to most expensive.
configs = [
    {'n_estimators': 50, 'max_depth': 3},
    {'n_estimators': 100, 'max_depth': 6},
    {'n_estimators': 200, 'max_depth': 10},
    {'n_estimators': 300, 'max_depth': 15},
]

costs = sorted(
    ((candidate, get_training_cost(candidate)) for candidate in configs),
    key=lambda pair: pair[1],
)

print("설정별 추정 비용:")
for candidate, estimated_cost in costs:
    print(f"  {candidate}: {estimated_cost:,}")

print("\nBlendSearch는 저비용 설정을 먼저 평가하여 빠르게 기준선 확보")

BlendSearch vs 다른 알고리즘

# Per-algorithm traits: (strategy, efficiency, cost awareness, early stopping).
_algorithm_traits = {
    'Random Search': ('무작위', '낮음', '없음', '없음'),
    'Grid Search': ('격자 기반', '매우 낮음', '없음', '없음'),
    'Bayesian Opt': ('확률 모델', '높음', '부분적', '가능'),
    'BlendSearch': ('글로벌+로컬', '매우 높음', '있음', '내장'),
}

# Column-oriented table; the algorithm names form the first column.
comparison = {'알고리즘': list(_algorithm_traits)}
for _position, _header in enumerate(['탐색 전략', '효율성', '비용 인식', '조기 종료']):
    comparison[_header] = [traits[_position] for traits in _algorithm_traits.values()]

print("\nBlendSearch vs 다른 알고리즘:")
comparison_table = pd.DataFrame(comparison)
print(comparison_table.to_string(index=False))

실전 팁

# Practical tips, one tuple per scenario: (situation, recommendation, effect).
_tip_rows = (
    ('시간 제약 심함', 'low_cost_init_value 설정', '빠른 기준선 확보'),
    ('큰 탐색 공간', 'time_budget 충분히 확보', '충분한 탐색'),
    ('도메인 지식 있음', 'points_to_evaluate 활용', '좋은 영역에서 시작'),
    ('재현성 필요', 'seed 고정', '결과 재현 가능'),
)

# Unzip the rows into the three display columns.
_situations, _recommendations, _effects = (list(column) for column in zip(*_tip_rows))
tips = {
    '상황': _situations,
    '권장 설정': _recommendations,
    '효과': _effects,
}

print("\nBlendSearch 실전 팁:")
tips_table = pd.DataFrame(tips)
print(tips_table.to_string(index=False))

디버깅과 모니터링

# Watch the search process through FLAML's log output.
automl_debug = AutoML()
automl_debug.fit(
    X_train, y_train,
    task="classification",
    time_budget=30,
    verbose=3,  # FLAML's default level; higher values print more messages
)

# Inspect the first few history entries. FLAML documents config_history as
# iteration -> (learner, config, time), recorded each time the best model is
# updated. The records are tuples without a per-entry loss, so calling .get()
# on them would raise AttributeError.
print("\n탐색 로그 분석:")
debug_history = getattr(automl_debug, 'config_history', None) or {}
for iter_id, (learner, config, elapsed) in list(debug_history.items())[:5]:
    print(f"\nIteration {iter_id}:")
    print(f"  Learner: {learner}")
    print(f"  Config: {config}")
    print(f"  Time: {elapsed}")

# The loss is only exposed for the overall best model.
print(f"\nBest Val Loss: {automl_debug.best_loss}")

정리

BlendSearch: 글로벌 탐색 + 로컬 탐색 결합
글로벌 탐색: Bayesian Optimization으로 전체 공간 탐험
로컬 탐색: CFO로 유망 영역 집중 탐색
비용 인식: 저비용 설정 우선 평가
자동 균형: 탐험과 활용 자동 조절
효율성: 제한된 시간에 좋은 설정 빠르게 발견

다음 글 예고

다음 글에서는 CFO(Cost-Frugal Optimization) 이해에 대해 알아보겠습니다. BlendSearch의 로컬 탐색에 사용되는 CFO 알고리즘을 상세히 다룹니다.

FLAML AutoML 마스터 시리즈 #070

개요

실습 환경

BlendSearch란?

글로벌 vs 로컬 탐색

BlendSearch 작동 원리

FLAML에서 BlendSearch 사용

탐색 히스토리 분석

tune.run으로 BlendSearch 직접 사용

BlendSearch 파라미터 조정

글로벌/로컬 비율 조정

비용 인식 탐색

BlendSearch vs 다른 알고리즘

실전 팁

디버깅과 모니터링

정리

다음 글 예고

개요