066 Custom Learner 만들기 기초

키워드: Custom Learner, 커스텀 모델, 확장

개요

FLAML은 기본 제공 모델 외에도 사용자가 직접 정의한 모델을 탐색할 수 있습니다. Custom Learner를 통해 도메인 특화 모델이나 새로운 알고리즘을 FLAML의 AutoML 프레임워크에 통합할 수 있습니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], scikit-learn, pandas, numpy

pip install "flaml[automl]" scikit-learn pandas numpy

Custom Learner 기본 구조

Learner 클래스 구조

from flaml import AutoML
from flaml.automl.model import SKLearnEstimator
from flaml.config import SAMPLE_SIZE
import numpy as np

# 066 Custom Learner의 기본 구조
class MyCustomLearner(SKLearnEstimator):
    """Bare-bones template showing the pieces a custom learner can define."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space (empty in this template)."""
        return {}

    def _preprocess(self, X):
        """Optional preprocessing hook; this template hands X back unchanged."""
        return X

# Recap of the steps illustrated by the template class.
print("\n".join([
    "Custom Learner 기본 구조:",
    "  1. SKLearnEstimator 상속",
    "  2. search_space() 정의",
    "  3. 선택적으로 _preprocess() 오버라이드",
]))

간단한 Custom Learner 예제

from sklearn.linear_model import Ridge
from flaml.automl.model import SKLearnEstimator

class CustomRidgeLearner(SKLearnEstimator):
    """Custom learner that lets FLAML tune scikit-learn's ``Ridge`` regressor."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the search space for ``alpha`` and ``fit_intercept``.

        Args:
            data_size: Training-data size supplied by the caller (unused here).
            task: Task name supplied by the caller (unused here).

        Returns:
            dict: hyperparameter name -> spec holding ``domain`` and the
            optional ``init_value`` / ``low_cost_init_value``.
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        return {
            'alpha': {
                # A flaml.tune domain is required for the value to be searched;
                # a plain list/array would be taken as one constant value.
                'domain': tune.loguniform(lower=1e-3, upper=1e3),  # 0.001 ~ 1000
                'init_value': 1.0,
                'low_cost_init_value': 1.0,
            },
            'fit_intercept': {
                'domain': tune.choice([True, False]),
                'init_value': True,
            },
        }

    def __init__(self, task='regression', n_jobs=None, **config):
        """Create the learner.

        Args:
            task: Task name; defaults to ``'regression'``.
            n_jobs: Accepted and discarded. Naming it here keeps it out of
                ``config``, because ``Ridge`` has no ``n_jobs`` argument.
            **config: Hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        self.estimator_class = Ridge

# Synthetic regression data for the demo
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=500, n_features=10, noise=10, random_state=42)

# Register the custom learner under a name and search over it alone
automl = AutoML()
automl.add_learner('custom_ridge', CustomRidgeLearner)

automl.fit(
    X_train=X,
    y_train=y,
    estimator_list=['custom_ridge'],
    task="regression",
    metric='mae',
    time_budget=30,
    verbose=1,
)

print(f"\n최적 설정: {automl.best_config}")
print(f"MAE: {automl.best_loss:.4f}")

탐색 공간 정의

연속형 하이퍼파라미터

class ContinuousParamLearner(SKLearnEstimator):
    """Example search space built from numeric (float and integer) ranges."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return a search space with log-scale, linear-scale and integer ranges.

        Args:
            data_size: Training-data size supplied by the caller (unused here).
            task: Task name supplied by the caller (unused here).

        Returns:
            dict: hyperparameter name -> spec holding ``domain`` and the
            optional ``init_value`` / ``low_cost_init_value``.
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        return {
            # Log scale: suited to ranges spanning several orders of magnitude.
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-5, upper=1.0),  # 1e-5 ~ 1
                'init_value': 0.01,
            },
            # Linear scale.
            'subsample': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 0.8,
            },
            # Integers 3..14 (randint's upper bound is exclusive).
            'max_depth': {
                'domain': tune.randint(lower=3, upper=15),
                'init_value': 6,
                'low_cost_init_value': 3,  # cheapest setting, tried first
            },
        }

print("연속형 파라미터 정의:")
print("  - domain: 탐색 범위 (flaml.tune 도메인, 예: tune.loguniform)")
print("  - init_value: 초기값")
print("  - low_cost_init_value: 빠른 평가를 위한 초기값")

범주형 하이퍼파라미터

class CategoricalParamLearner(SKLearnEstimator):
    """Example search space made only of categorical choices."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return a search space of string, boolean and integer choices.

        Args:
            data_size: Training-data size supplied by the caller (unused here).
            task: Task name supplied by the caller (unused here).

        Returns:
            dict: hyperparameter name -> spec holding ``domain`` and
            ``init_value``.
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        # Each candidate list is wrapped in tune.choice; a bare list would be
        # passed through as a single constant value instead of being searched.
        return {
            # String options
            'kernel': {
                'domain': tune.choice(['linear', 'rbf', 'poly']),
                'init_value': 'rbf',
            },
            # Boolean flag
            'normalize': {
                'domain': tune.choice([True, False]),
                'init_value': True,
            },
            # Fixed set of integers
            'n_neighbors': {
                'domain': tune.choice([3, 5, 7, 9, 11]),
                'init_value': 5,
            },
        }

# Summary of the kinds of categorical values shown above.
print("\n".join([
    "범주형 파라미터:",
    "  - 문자열 리스트",
    "  - 불리언 [True, False]",
    "  - 정수 리스트",
]))

조건부 하이퍼파라미터

class ConditionalParamLearner(SKLearnEstimator):
    """Example of hyperparameters that only apply for certain kernels."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return a search space containing kernel-dependent hyperparameters.

        Args:
            data_size: Training-data size supplied by the caller (unused here).
            task: Task name supplied by the caller (unused here).

        Returns:
            dict: hyperparameter name -> spec holding ``domain`` and
            ``init_value``.
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        return {
            'kernel': {
                'domain': tune.choice(['linear', 'rbf', 'poly']),
                'init_value': 'rbf',
            },
            # Only meaningful when kernel == 'poly'
            'degree': {
                'domain': tune.choice([2, 3, 4, 5]),
                'init_value': 3,
            },
            # Only meaningful when kernel == 'rbf'
            'gamma': {
                'domain': tune.loguniform(lower=1e-4, upper=10.0),
                'init_value': 0.1,
            },
        }

    def config2params(self, config):
        """Convert a sampled config into constructor parameters.

        Starts from the base-class conversion so its handling of the config is
        kept, then drops the entries that do not apply to the chosen kernel.

        Args:
            config: dict of sampled hyperparameter values.

        Returns:
            dict: a new dict; ``config`` itself is not modified.
        """
        # dict(...) guarantees we never mutate the caller's config.
        params = dict(super().config2params(config))

        kernel = params.get('kernel')
        if kernel != 'poly':
            params.pop('degree', None)
        if kernel != 'rbf':
            params.pop('gamma', None)

        return params

# Summary of how conditional hyperparameters are handled above.
print("\n".join([
    "조건부 파라미터:",
    "  - config2params()에서 조건에 따라 파라미터 제거",
    "  - 불필요한 파라미터 탐색 방지",
]))

데이터 크기 기반 탐색 공간

class DataAwareLearner(SKLearnEstimator):
    """Example learner whose search ranges grow with the number of rows."""

    @staticmethod
    def _n_samples(data_size):
        """Extract the row count from ``data_size``.

        Accepts a ``(n_rows, n_cols)`` shape tuple, a bare row count, or the
        dict form keyed by ``SAMPLE_SIZE`` that earlier callers used.
        """
        if isinstance(data_size, dict):
            return data_size.get(SAMPLE_SIZE, 1000)
        try:
            return data_size[0]
        except (TypeError, IndexError):
            # Not indexable: treat it as the row count itself.
            return data_size

    @classmethod
    def search_space(cls, data_size, task):
        """Return a search space sized to the training data.

        Args:
            data_size: Shape tuple ``(n_rows, n_cols)``, a row count, or a
                dict keyed by ``SAMPLE_SIZE``.
            task: Task name supplied by the caller (unused here).

        Returns:
            dict: specs for ``n_estimators`` and ``max_depth``. Each starts at
            the middle candidate, with the smallest as the low-cost start.
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        n_samples = cls._n_samples(data_size)

        # Larger datasets get larger forests and deeper trees.
        if n_samples < 1000:
            n_estimators_range = list(range(10, 100, 10))
            max_depth_range = list(range(3, 10))
        elif n_samples < 10000:
            n_estimators_range = list(range(50, 300, 25))
            max_depth_range = list(range(5, 15))
        else:
            n_estimators_range = list(range(100, 500, 50))
            max_depth_range = list(range(10, 30, 5))

        return {
            'n_estimators': {
                'domain': tune.choice(n_estimators_range),
                'init_value': n_estimators_range[len(n_estimators_range) // 2],
                'low_cost_init_value': n_estimators_range[0],
            },
            'max_depth': {
                'domain': tune.choice(max_depth_range),
                'init_value': max_depth_range[len(max_depth_range) // 2],
                'low_cost_init_value': max_depth_range[0],
            },
        }

# Show how the candidate values change with the number of rows
print("데이터 크기 기반 탐색 공간:")
for size in [500, 5000, 50000]:
    # (rows, columns) shape tuple; the column count does not affect the result
    space = DataAwareLearner.search_space((size, 10), 'regression')
    print(f"  n={size}: n_estimators={space['n_estimators']['domain'].categories}")

전처리 통합

from sklearn.preprocessing import StandardScaler

class PreprocessingLearner(SKLearnEstimator):
    """Ridge learner that standardizes features before fitting and predicting."""

    def __init__(self, task='regression', n_jobs=None, **config):
        """Create the learner.

        Args:
            task: Task name; defaults to ``'regression'``.
            n_jobs: Accepted and discarded. Naming it here keeps it out of
                ``config``, because ``Ridge`` has no ``n_jobs`` argument.
            **config: Hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        self.estimator_class = Ridge
        self.scaler = None  # fitted StandardScaler; None until fit() has run

    def _preprocess(self, X):
        """Apply the base preprocessing, then the fitted scaler if there is one.

        This method only transforms; the scaler is fitted in ``fit()``. That
        keeps it safe to call once per fit/predict from the base class without
        re-fitting or scaling the same data twice.
        """
        X = super()._preprocess(X)
        if self.scaler is None:
            return X
        return self.scaler.transform(X)

    def fit(self, X_train, y_train, budget=None, **kwargs):
        """Fit the scaler on the training data, then fit the model.

        The scaler is rebuilt on every call so a refit never reuses statistics
        from an earlier dataset. The raw ``X_train`` is handed to the base
        ``fit``; scaling is applied through ``_preprocess()``.
        """
        self.scaler = StandardScaler().fit(super()._preprocess(X_train))
        return super().fit(X_train, y_train, budget, **kwargs)

    def predict(self, X):
        """Predict for ``X``; scaling is applied through ``_preprocess()``."""
        return super().predict(X)

    @classmethod
    def search_space(cls, data_size, task):
        """Return the search space for Ridge's ``alpha``.

        Args:
            data_size: Training-data size supplied by the caller (unused here).
            task: Task name supplied by the caller (unused here).
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        return {
            'alpha': {
                'domain': tune.loguniform(lower=1e-3, upper=1e3),
                'init_value': 1.0,
            },
        }

# Summary of how preprocessing is wired into the learner above.
print("\n".join([
    "전처리 통합:",
    "  - _preprocess() 오버라이드",
    "  - fit()과 predict()에서 전처리 적용",
]))

Custom Learner 등록 및 사용

from sklearn.ensemble import RandomForestRegressor

class CustomRFLearner(SKLearnEstimator):
    """Custom learner that lets FLAML tune ``RandomForestRegressor``."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the search space for the random-forest hyperparameters.

        Args:
            data_size: Training-data size supplied by the caller (unused here).
            task: Task name supplied by the caller (unused here).

        Returns:
            dict: hyperparameter name -> spec holding ``domain`` and the
            optional ``init_value`` / ``low_cost_init_value``.
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        # Candidate lists are wrapped in tune.choice so they are searched
        # rather than passed to the estimator as a single constant value.
        return {
            'n_estimators': {
                'domain': tune.choice(list(range(10, 200, 10))),
                'init_value': 100,
                'low_cost_init_value': 10,
            },
            'max_depth': {
                # None means "no depth limit" for the forest.
                'domain': tune.choice(list(range(3, 20)) + [None]),
                'init_value': 10,
                'low_cost_init_value': 3,
            },
            'min_samples_split': {
                'domain': tune.choice([2, 5, 10, 20]),
                'init_value': 2,
            },
            'min_samples_leaf': {
                'domain': tune.choice([1, 2, 4, 8]),
                'init_value': 1,
            },
        }

    def __init__(self, task='regression', **config):
        """Create the learner; ``config`` is forwarded to the base class."""
        super().__init__(task, **config)
        self.estimator_class = RandomForestRegressor

# Build a synthetic regression problem and hold out 20% for testing
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=1000, n_features=20, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Register the custom learner on a fresh AutoML instance
automl = AutoML()
automl.add_learner('custom_rf', CustomRFLearner)

# Search a built-in learner and the custom learner side by side
automl.fit(
    X_train=X_train,
    y_train=y_train,
    estimator_list=['lgbm', 'custom_rf'],  # built-in + custom
    task="regression",
    metric='mae',
    time_budget=60,
    verbose=1,
)

print(f"\n최적 모델: {automl.best_estimator}")
print(f"최적 설정: {automl.best_config}")

# Score the selected model on the held-out split
from sklearn.metrics import mean_absolute_error
y_pred = automl.predict(X_test)
print(f"Test MAE: {mean_absolute_error(y_test, y_pred):.4f}")

분류용 Custom Learner

from sklearn.ensemble import GradientBoostingClassifier

class CustomGBClassifier(SKLearnEstimator):
    """Custom learner that lets FLAML tune ``GradientBoostingClassifier``."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the search space for the gradient-boosting hyperparameters.

        Args:
            data_size: Training-data size supplied by the caller (unused here).
            task: Task name supplied by the caller (unused here).

        Returns:
            dict: hyperparameter name -> spec holding ``domain`` and the
            optional ``init_value`` / ``low_cost_init_value``.
        """
        # Imported locally so this snippet works on its own.
        from flaml import tune

        return {
            'n_estimators': {
                'domain': tune.choice(list(range(50, 300, 25))),
                'init_value': 100,
                'low_cost_init_value': 50,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-2, upper=1.0),
                'init_value': 0.1,
            },
            'max_depth': {
                # Integers 3..9 (randint's upper bound is exclusive).
                'domain': tune.randint(lower=3, upper=10),
                'init_value': 5,
                'low_cost_init_value': 3,
            },
            'subsample': {
                'domain': tune.uniform(lower=0.6, upper=1.0),
                'init_value': 0.8,
            },
        }

    def __init__(self, task='classification', n_jobs=None, **config):
        """Create the learner.

        Args:
            task: Task name; defaults to ``'classification'``.
            n_jobs: Accepted and discarded. Naming it here keeps it out of
                ``config``, because ``GradientBoostingClassifier`` has no
                ``n_jobs`` argument.
            **config: Hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        self.estimator_class = GradientBoostingClassifier

# Build a synthetic binary classification problem and hold out 20% for testing
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_classes=2,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Register the custom classifier on its own AutoML instance
automl_clf = AutoML()
automl_clf.add_learner('custom_gb', CustomGBClassifier)

automl_clf.fit(
    X_train=X_train,
    y_train=y_train,
    estimator_list=['lgbm', 'custom_gb'],
    task="classification",
    metric='accuracy',
    time_budget=60,
    verbose=1,
)

print(f"\n최적 분류 모델: {automl_clf.best_estimator}")

Custom Learner 체크리스트

# (item, description) pairs are kept together so the two table columns cannot
# drift out of step with each other.
_CHECKLIST_ROWS = [
    ('SKLearnEstimator 상속', 'Custom Learner의 기본 클래스'),
    ('search_space() 정의', '하이퍼파라미터 탐색 공간'),
    ('__init__에서 estimator_class 설정', '실제 sklearn 모델 클래스'),
    # The search range must be a flaml.tune domain, not a bare list or array.
    ('domain에 탐색 범위 지정', 'flaml.tune 도메인으로 범위 지정'),
    ('init_value 설정', '탐색 시작점'),
    ('low_cost_init_value (선택)', '빠른 평가를 위한 초기값'),
    ('전처리 필요시 _preprocess()', '스케일링, 인코딩 등'),
    ('add_learner()로 등록', 'AutoML 객체에 등록'),
]

checklist = {
    '항목': [item for item, _ in _CHECKLIST_ROWS],
    '설명': [description for _, description in _CHECKLIST_ROWS],
}

print("\nCustom Learner 체크리스트:")
import pandas as pd
print(pd.DataFrame(checklist).to_string(index=False))

정리

Custom Learner: SKLearnEstimator 상속하여 구현
search_space(): 하이퍼파라미터 탐색 공간 정의
domain: 탐색 범위 (flaml.tune 도메인 — tune.choice, tune.uniform, tune.loguniform 등)
init_value: 초기값, low_cost_init_value는 빠른 탐색용
add_learner(): AutoML에 커스텀 모델 등록
기존 FLAML 모델과 함께 탐색 가능

다음 글 예고

다음 글에서는 Custom Learner - sklearn 모델 래핑에 대해 알아보겠습니다. 다양한 sklearn 모델을 FLAML에 통합하는 방법을 상세히 다룹니다.

FLAML AutoML 마스터 시리즈 #066

개요

실습 환경

Custom Learner 기본 구조

Learner 클래스 구조

간단한 Custom Learner 예제

탐색 공간 정의

연속형 하이퍼파라미터

범주형 하이퍼파라미터

조건부 하이퍼파라미터

데이터 크기 기반 탐색 공간

전처리 통합

Custom Learner 등록 및 사용

분류용 Custom Learner

Custom Learner 체크리스트

정리

다음 글 예고

개요