069 탐색 공간(Search Space) 직접 정의하기

키워드: Search Space, 탐색 공간, 하이퍼파라미터

개요

FLAML의 기본 탐색 공간을 사용자 요구에 맞게 수정하면 더 효율적인 하이퍼파라미터 최적화가 가능합니다. 이 글에서는 탐색 공간을 직접 정의하고 커스터마이징하는 방법을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl]

pip install "flaml[automl]" pandas numpy matplotlib

탐색 공간 기본 개념

import numpy as np
import pandas as pd
from flaml import AutoML, tune
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

# Build a synthetic classification dataset and hold out 20% of it for testing.
X, y = make_classification(
    n_samples=2000,
    n_features=20,
    n_informative=10,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Emit the concept summary in a single call; the output is the same four lines.
print("\n".join((
    "탐색 공간(Search Space)이란?",
    "  - 하이퍼파라미터가 탐색될 범위",
    "  - 각 파라미터의 타입(연속, 이산, 범주형)",
    "  - 초기값과 우선순위",
)))

FLAML 탐색 공간 형식

기본 형식

# Shape of one FLAML search-space entry:
#   '<hyperparameter name>': {
#       'domain': <flaml.tune domain object, or a constant>,  # required
#       'init_value': <starting value>,                       # optional
#       'low_cost_init_value': <cheap-to-train value>,        # optional
#   }
# The 'domain' must be a flaml.tune object (tune.uniform, tune.randint,
# tune.choice, ...). A plain Python list is NOT a set of candidates: any
# non-tune value is treated as a single constant.
example_space = {
    'param_name': {
        'domain': tune.uniform(lower=0.0, upper=1.0),  # search range (required)
        'init_value': 0.5,                             # starting value (optional)
        'low_cost_init_value': 0.0,                    # low-cost starting value (optional)
    }
}

# Concrete example for LightGBM.
lgbm_custom_space = {
    'n_estimators': {
        'domain': tune.lograndint(lower=50, upper=500),
        'init_value': 100,
        'low_cost_init_value': 50,
    },
    'learning_rate': {
        'domain': tune.loguniform(lower=1e-3, upper=1.0),
        'init_value': 0.1,
    },
    'max_depth': {
        # tune.randint excludes the upper bound, so this covers 3..14.
        'domain': tune.randint(lower=3, upper=15),
        'init_value': 6,
        'low_cost_init_value': 3,
    },
}

print("탐색 공간 구성 요소:")
print("  - domain: 값의 범위와 분포 (flaml.tune 도메인 객체)")
print("  - init_value: 탐색 시작점")
print("  - low_cost_init_value: 빠른 평가용 초기값")

domain 타입별 정의

# Each example uses a flaml.tune domain object. A bare list (np.linspace(...)
# .tolist(), list(range(...)), ...) would be interpreted by FLAML as one
# constant value rather than as a set of candidates.

# 1. Continuous, linear scale
linear_space = {
    'param': {
        'domain': tune.uniform(lower=0.1, upper=1.0),
        'init_value': 0.5,
    }
}

# 2. Continuous, log scale (1e-5 .. 1)
log_space = {
    'param': {
        'domain': tune.loguniform(lower=1e-5, upper=1.0),
        'init_value': 0.01,
    }
}

# 3. Integer (upper bound is exclusive: 1 .. 99)
integer_space = {
    'param': {
        'domain': tune.randint(lower=1, upper=100),
        'init_value': 10,
    }
}

# 4. Categorical
categorical_space = {
    'param': {
        'domain': tune.choice(['option_a', 'option_b', 'option_c']),
        'init_value': 'option_a',
    }
}

# 5. Boolean (a two-valued categorical)
boolean_space = {
    'param': {
        'domain': tune.choice([True, False]),
        'init_value': True,
    }
}

print("domain 타입:")
print("  - 연속형: tune.uniform 또는 tune.loguniform 사용")
print("  - 정수형: tune.randint 사용")
print("  - 범주형: tune.choice(문자열 리스트)")
print("  - 불리언: tune.choice([True, False])")

custom_hp로 기존 모델 공간 수정

# Custom LightGBM search space passed through custom_hp.
# Every 'domain' is a flaml.tune object; a plain list would be treated as a
# single constant and handed to LightGBM as-is.
lgbm_custom = {
    'n_estimators': {
        'domain': tune.lograndint(lower=100, upper=1000),  # more trees
        'init_value': 200,
        'low_cost_init_value': 100,
    },
    'num_leaves': {
        'domain': tune.lograndint(lower=20, upper=200),  # more leaves
        'init_value': 31,
        'low_cost_init_value': 20,
    },
    'learning_rate': {
        'domain': tune.loguniform(lower=1e-3, upper=1e-1),  # small learning rates
        'init_value': 0.05,
    },
    # colsample_bytree / subsample are LightGBM's aliases for
    # feature_fraction / bagging_fraction; using these names follows the FLAML
    # documentation's own custom_hp example for lgbm.
    'colsample_bytree': {
        'domain': tune.uniform(lower=0.5, upper=1.0),
        'init_value': 0.9,
    },
    'subsample': {
        'domain': tune.uniform(lower=0.5, upper=1.0),
        'init_value': 0.9,
    },
    # Constant (not searched): row subsampling only takes effect when the
    # subsampling frequency is > 0.
    'subsample_freq': {
        'domain': 1,
    },
}

# Apply the custom space to the 'lgbm' learner only.
automl = AutoML()
automl.fit(
    X_train, y_train,
    task="classification",
    time_budget=60,
    estimator_list=['lgbm'],
    custom_hp={'lgbm': lgbm_custom},  # custom search space
    verbose=1
)

print(f"\n최적 설정: {automl.best_config}")

여러 모델 탐색 공간 동시 정의

# Search spaces for several learners at once, keyed by FLAML learner name.
# Domains are flaml.tune objects; plain lists would be treated as constants.
multi_custom_hp = {
    'lgbm': {
        'n_estimators': {
            'domain': tune.lograndint(lower=100, upper=500),
            'init_value': 100,
        },
        'learning_rate': {
            'domain': tune.loguniform(lower=1e-3, upper=1.0),
            'init_value': 0.1,
        },
    },
    'rf': {
        'n_estimators': {
            'domain': tune.lograndint(lower=50, upper=300),
            'init_value': 100,
        },
        'max_depth': {
            # scikit-learn requires max_depth >= 1 (or None); 0 is rejected,
            # so the range starts at 5 and no "0 means unlimited" value is used.
            'domain': tune.randint(lower=5, upper=30),
            'init_value': 10,
        },
    },
    'xgboost': {
        'n_estimators': {
            'domain': tune.lograndint(lower=100, upper=500),
            'init_value': 100,
        },
        'max_depth': {
            # Upper bound is exclusive: 3 .. 11.
            'domain': tune.randint(lower=3, upper=12),
            'init_value': 6,
        },
        'learning_rate': {
            'domain': tune.loguniform(lower=1e-3, upper=1.0),
            'init_value': 0.1,
        },
    },
}

automl_multi = AutoML()
automl_multi.fit(
    X_train, y_train,
    task="classification",
    time_budget=90,
    estimator_list=['lgbm', 'rf', 'xgboost'],
    custom_hp=multi_custom_hp,
    verbose=1
)

print(f"\n최적 모델: {automl_multi.best_estimator}")
print(f"최적 설정: {automl_multi.best_config}")

탐색 범위 축소로 효율성 향상

# Narrow ranges chosen from prior knowledge so a short budget is spent well.
# Domains are flaml.tune objects (integer upper bounds are exclusive).
focused_space = {
    'lgbm': {
        'n_estimators': {
            'domain': tune.randint(lower=150, upper=250),  # narrow range
            'init_value': 200,
        },
        'num_leaves': {
            'domain': tune.randint(lower=25, upper=40),  # narrow range
            'init_value': 31,
        },
        'learning_rate': {
            'domain': tune.uniform(lower=0.05, upper=0.15),  # narrow range
            'init_value': 0.1,
        },
        'min_child_samples': {
            'domain': tune.randint(lower=15, upper=25),  # narrow range
            'init_value': 20,
        },
    }
}

automl_focused = AutoML()
automl_focused.fit(
    X_train, y_train,
    task="classification",
    time_budget=30,  # a short budget is enough for a focused space
    estimator_list=['lgbm'],
    custom_hp=focused_space,
    verbose=1
)

print(f"\n집중 탐색 결과: {automl_focused.best_config}")

조건부 탐색 공간

from flaml.automl.model import SKLearnEstimator
from sklearn.svm import SVC

class ConditionalSVCLearner(SKLearnEstimator):
    """SVC learner that drops kernel-specific hyperparameters when unused.

    The search space always contains gamma, degree and coef0; config2params
    removes the ones that do not apply to the sampled kernel before the
    parameters reach scikit-learn's SVC.
    """

    # Hyperparameters removed for each kernel. Kernels not listed here keep
    # every parameter.
    _UNUSED_BY_KERNEL = {
        'linear': ('gamma', 'degree', 'coef0'),
        'rbf': ('degree', 'coef0'),
        'poly': (),
        'sigmoid': ('degree',),
    }

    @classmethod
    def search_space(cls, data_size, task):
        """Return the search space; data_size and task are not used here.

        Every 'domain' is a flaml.tune object. A plain list would be treated
        by FLAML as one constant value instead of a set of candidates.
        """
        return {
            'C': {
                'domain': tune.loguniform(lower=1e-3, upper=1e3),
                'init_value': 1.0,
            },
            'kernel': {
                'domain': tune.choice(['rbf', 'linear', 'poly', 'sigmoid']),
                'init_value': 'rbf',
            },
            # Used by rbf, poly and sigmoid only.
            'gamma': {
                'domain': tune.loguniform(lower=1e-4, upper=10.0),
                'init_value': 0.1,
            },
            # Used by poly only (upper bound is exclusive: 2 .. 5).
            'degree': {
                'domain': tune.randint(lower=2, upper=6),
                'init_value': 3,
            },
            # Used by poly and sigmoid only.
            'coef0': {
                'domain': tune.uniform(lower=0.0, upper=1.0),
                'init_value': 0.0,
            },
        }

    def config2params(self, config):
        """Convert a sampled config into SVC constructor parameters.

        Args:
            config: Dict of sampled hyperparameters (plus whatever extra keys
                FLAML passes to the learner).

        Returns:
            A new dict without the parameters unused by the chosen kernel.
        """
        # Start from the base conversion so FLAML-internal keys are handled
        # the same way as for built-in learners.
        params = super().config2params(config)
        # FLAML forwards n_jobs to learners; SVC has no such argument.
        params.pop('n_jobs', None)

        kernel = params.get('kernel', 'rbf')
        for name in self._UNUSED_BY_KERNEL.get(kernel, ()):
            params.pop(name, None)
        return params

    def __init__(self, task='classification', **config):
        super().__init__(task, **config)
        self.estimator_class = SVC
        # Force probability estimates on for every fitted SVC.
        self.params['probability'] = True

# Register the custom learner as 'cond_svc' and restrict the search to it.
automl_cond = AutoML()
automl_cond.add_learner('cond_svc', ConditionalSVCLearner)
automl_cond.fit(
    X_train=X_train,
    y_train=y_train,
    estimator_list=['cond_svc'],
    task="classification",
    time_budget=60,
    verbose=1,
)

print(f"\n조건부 탐색 결과: {automl_cond.best_config}")

데이터 크기 기반 동적 탐색 공간

try:
    from flaml.config import SAMPLE_SIZE
except ImportError:
    # NOTE(review): could not confirm that flaml.config exports SAMPLE_SIZE.
    # The name is only used as a dict key for the legacy dict form of
    # data_size below, so a local fallback keeps this section runnable.
    SAMPLE_SIZE = 'FLAML_sample_size'


class DataAdaptiveLearner(SKLearnEstimator):
    """Gradient-boosting learner whose search space scales with the data size."""

    @staticmethod
    def _row_count(data_size):
        """Extract the number of training rows from data_size.

        FLAML documents data_size as a (rows, columns) tuple. A dict keyed by
        SAMPLE_SIZE (the form this tutorial originally used) and a bare
        integer are accepted as well.
        """
        if isinstance(data_size, dict):
            return data_size.get(SAMPLE_SIZE, 1000)
        if isinstance(data_size, (tuple, list)):
            return data_size[0]
        return int(data_size)

    @classmethod
    def search_space(cls, data_size, task):
        """Return a search space sized to the number of training rows.

        Args:
            data_size: Training-data size; see _row_count for accepted forms.
            task: Task name (not used to shape the space).

        Returns:
            Dict of hyperparameter specs whose 'domain' values are flaml.tune
            objects (integer upper bounds are exclusive).
        """
        n_samples = cls._row_count(data_size)

        if n_samples < 1000:
            # Small data: keep the model simple.
            n_est_low, n_est_high = 10, 100
            depth_low, depth_high = 2, 8
            lr_domain, lr_init = tune.uniform(lower=0.1, upper=0.3), 0.2
        elif n_samples < 10000:
            # Medium data.
            n_est_low, n_est_high = 50, 300
            depth_low, depth_high = 3, 12
            lr_domain, lr_init = tune.loguniform(lower=1e-2, upper=1.0), 0.1
        else:
            # Large data: more complex models are affordable.
            n_est_low, n_est_high = 100, 1000
            depth_low, depth_high = 5, 20
            lr_domain, lr_init = tune.loguniform(lower=1e-3, upper=1.0), 0.03

        return {
            'n_estimators': {
                'domain': tune.randint(lower=n_est_low, upper=n_est_high),
                # Start from the middle of the range; the cheapest model is
                # the one with the fewest trees.
                'init_value': (n_est_low + n_est_high) // 2,
                'low_cost_init_value': n_est_low,
            },
            'max_depth': {
                'domain': tune.randint(lower=depth_low, upper=depth_high),
                'init_value': (depth_low + depth_high) // 2,
                'low_cost_init_value': depth_low,
            },
            'learning_rate': {
                'domain': lr_domain,
                'init_value': lr_init,
            },
        }

    def config2params(self, config):
        """Convert a sampled config into GradientBoostingClassifier parameters."""
        params = super().config2params(config)
        # FLAML forwards n_jobs to learners; GradientBoostingClassifier has no
        # such argument.
        params.pop('n_jobs', None)
        return params

    def __init__(self, task='classification', **config):
        super().__init__(task, **config)
        from sklearn.ensemble import GradientBoostingClassifier
        # NOTE(review): a classifier is used regardless of `task`.
        self.estimator_class = GradientBoostingClassifier


# Show how the search space changes with the number of training rows.
for n in [500, 5000, 50000]:
    space = DataAdaptiveLearner.search_space((n, 20), 'classification')
    n_est_domain = space['n_estimators']['domain']
    depth_domain = space['max_depth']['domain']
    print(f"\nn={n}:")
    print(f"  n_estimators: [{n_est_domain.lower}, {n_est_domain.upper})")
    print(f"  max_depth: [{depth_domain.lower}, {depth_domain.upper})")

탐색 공간 시각화

import matplotlib.pyplot as plt

def visualize_search_space(space, param_name):
    """Plot the candidate values of one hyperparameter.

    Numeric domains get a histogram (with the init value marked, if any) next
    to a sorted-values plot; any other domain gets a one-bar-per-category
    chart.

    Args:
        space: Mapping of hyperparameter name to a spec dict holding a
            'domain' sequence of candidate values and an optional
            'init_value'.
        param_name: Key in ``space`` to visualize.

    Raises:
        KeyError: If ``param_name`` or its 'domain' entry is missing.
        ValueError: If the domain is empty.
    """
    spec = space[param_name]
    domain = spec['domain']
    init_value = spec.get('init_value')

    if len(domain) == 0:
        raise ValueError(f"'{param_name}' has an empty domain; nothing to plot")

    # bool is a subclass of int, so exclude it explicitly: True/False are
    # categories, not numbers to histogram. Every value is checked so a mixed
    # domain falls back to the category chart.
    is_numeric = all(
        isinstance(v, (int, float)) and not isinstance(v, bool) for v in domain
    )

    plt.figure(figsize=(12, 4))

    if is_numeric:
        plt.subplot(1, 2, 1)
        plt.hist(domain, bins=30, edgecolor='black', alpha=0.7)
        # Compare against None so an init value of 0 / 0.0 is still drawn.
        if init_value is not None:
            plt.axvline(init_value, color='red', linestyle='--', label=f'init: {init_value}')
            # Only request a legend when there is a labelled artist to show.
            plt.legend()
        plt.xlabel(param_name)
        plt.ylabel('Count')
        plt.title(f'{param_name} Distribution')

        plt.subplot(1, 2, 2)
        plt.plot(sorted(domain), marker='o', markersize=2)
        plt.xlabel('Index')
        plt.ylabel(param_name)
        plt.title(f'{param_name} Values (sorted)')
    else:
        positions = range(len(domain))
        plt.bar(positions, [1] * len(domain))
        # Tick labels are stringified so non-string categories render too.
        plt.xticks(positions, [str(v) for v in domain], rotation=45)
        plt.ylabel('Count')
        plt.title(f'{param_name} Categories')

    plt.tight_layout()
    plt.show()

# Visualize 50 log-spaced candidate learning rates between 1e-4 and 1,
# together with the chosen starting value.
example_space = {
    'learning_rate': dict(
        domain=np.logspace(-4, 0, 50).tolist(),
        init_value=0.01,
    ),
}

visualize_search_space(example_space, param_name='learning_rate')

탐색 공간 설계 가이드

# Quick-reference table: which flaml.tune domain to use in which situation.
# The three columns must stay the same length (one entry per row).
guide = {
    '상황': [
        '연속형 (큰 범위)',
        '연속형 (작은 범위)',
        '정수형',
        '범주형',
        '빠른 탐색',
        '정밀 탐색'
    ],
    'domain 정의': [
        'tune.loguniform 사용',
        'tune.uniform 사용',
        'tune.randint 사용',
        'tune.choice 사용',
        '좁은 범위 또는 큰 간격(q)',
        '넓은 연속 범위'
    ],
    '예시': [
        'tune.loguniform(1e-5, 1)',
        'tune.uniform(0.1, 0.5)',
        'tune.randint(1, 100)',
        "tune.choice(['a', 'b', 'c'])",
        'tune.qrandint(50, 200, 50)',
        'tune.uniform(0, 1)'
    ]
}

print("\n탐색 공간 설계 가이드:")
print(pd.DataFrame(guide).to_string(index=False))

정리

custom_hp: AutoML.fit()에서 탐색 공간 수정
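domain: flaml.tune 도메인 객체(tune.choice, tune.randint, tune.loguniform 등)로 값의 범위와 분포 정의 (일반 리스트를 그대로 넣으면 후보 목록이 아니라 상수 하나로 취급됨)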
init_value: 탐색 시작점 (기본값)
low_cost_init_value: 빠른 평가용 초기값
조건부 파라미터: config2params()로 처리
동적 탐색 공간: data_size 활용
도메인 지식으로 범위 축소 시 효율 향상

다음 글 예고

다음 글에서는 BlendSearch 알고리즘 이해에 대해 알아보겠습니다. FLAML의 핵심 탐색 알고리즘인 BlendSearch의 원리를 다룹니다.

FLAML AutoML 마스터 시리즈 #069

개요​

실습 환경​

탐색 공간 기본 개념​

FLAML 탐색 공간 형식​

기본 형식​

domain 타입별 정의​

custom_hp로 기존 모델 공간 수정​

여러 모델 탐색 공간 동시 정의​

탐색 범위 축소로 효율성 향상​

조건부 탐색 공간​

데이터 크기 기반 동적 탐색 공간​

탐색 공간 시각화​

탐색 공간 설계 가이드​

정리​

다음 글 예고​

개요