068 Custom Learner - XGBoost 커스터마이징

키워드: XGBoost, 커스터마이징, 하이퍼파라미터

개요

XGBoost는 FLAML에서 기본 지원하지만, 더 세밀한 하이퍼파라미터 튜닝이 필요한 경우 Custom Learner로 확장할 수 있습니다. 이 글에서는 XGBoost의 다양한 파라미터를 FLAML에서 최적화하는 방법을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], xgboost (2.0 이상 권장 — `device` 파라미터 사용), scikit-learn, pandas, numpy

pip install "flaml[automl]" xgboost pandas numpy

XGBoost 파라미터 이해

import numpy as np
import pandas as pd
import xgboost as xgb
from flaml import AutoML
from flaml import tune
from flaml.automl.model import SKLearnEstimator
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

# 068 Data preparation: one synthetic binary-classification set and one
# synthetic regression set, each split 80/20 with a fixed seed.
_split_options = {"test_size": 0.2, "random_state": 42}

X_clf, y_clf = make_classification(
    n_samples=5000,
    n_features=30,
    n_informative=15,
    n_classes=2,
    random_state=42,
)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(
    X_clf, y_clf, **_split_options
)

X_reg, y_reg = make_regression(
    n_samples=5000,
    n_features=30,
    noise=10,
    random_state=42,
)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, **_split_options
)

# Overview table of the main XGBoost parameter families.
# `gamma` is listed with the regularization parameters (it is the minimum loss
# reduction required to split), and `eta` is shown as the alias of
# `learning_rate` rather than as a separate knob.
print("XGBoost 주요 파라미터 카테고리:")
params = {
    '카테고리': ['트리 구조', '부스팅', '정규화', '학습률', '샘플링'],
    '파라미터': [
        'max_depth, min_child_weight',
        'n_estimators, learning_rate',
        'reg_alpha (L1), reg_lambda (L2), gamma',
        'learning_rate (eta)',
        'subsample, colsample_bytree'
    ]
}
print(pd.DataFrame(params).to_string(index=False))

기본 XGBoost Custom Learner

class CustomXGBClassifier(SKLearnEstimator):
    """XGBoost classifier registered with FLAML using an extended search space.

    Tunes tree structure, boosting, regularization and sampling parameters of
    ``xgb.XGBClassifier``.
    """

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search. ``init_value`` entries lie inside their domains.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain``,
            ``init_value`` and (optionally) ``low_cost_init_value``.
        """
        return {
            # Tree structure
            'max_depth': {
                'domain': tune.randint(lower=3, upper=15),  # 3..14
                'init_value': 6,
                'low_cost_init_value': 3,
            },
            'min_child_weight': {
                'domain': tune.randint(lower=1, upper=10),  # 1..9
                'init_value': 1,
            },

            # Boosting
            'n_estimators': {
                'domain': tune.qrandint(lower=50, upper=475, q=25),
                'init_value': 100,
                'low_cost_init_value': 50,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-3, upper=1.0),
                'init_value': 0.1,
            },

            # Regularization
            'reg_alpha': {
                'domain': tune.loguniform(lower=1e-5, upper=1e2),
                # 0.0 cannot be represented on a log scale; start at the
                # lower bound (effectively no L1 penalty).
                'init_value': 1e-5,
            },
            'reg_lambda': {
                'domain': tune.loguniform(lower=1e-5, upper=1e2),
                'init_value': 1.0,
            },

            # Sampling
            'subsample': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 1.0,
            },
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 1.0,
            },
        }

    def __init__(self, task='classification', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        self.estimator_class = xgb.XGBClassifier
        # Fixed (non-tuned) setting. `use_label_encoder` is intentionally not
        # set: recent XGBoost releases no longer accept it and only warn.
        self.params['eval_metric'] = 'logloss'

# 068 Basic run: search only over the custom learner, optimizing accuracy.
automl = AutoML()
automl.add_learner('custom_xgb', CustomXGBClassifier)

basic_fit_settings = {
    "task": "classification",
    "time_budget": 60,
    "estimator_list": ['custom_xgb'],
    "metric": 'accuracy',
    "verbose": 1,
}
automl.fit(X_clf_train, y_clf_train, **basic_fit_settings)

# The printed accuracy is derived as 1 - best_loss.
best_accuracy = 1 - automl.best_loss
print(f"\n최적 설정: {automl.best_config}")
print(f"정확도: {best_accuracy:.4f}")

고급 XGBoost 파라미터

트리 구조 상세

class XGBTreeStructureLearner(SKLearnEstimator):
    """XGBoost classifier whose search space focuses on tree-structure parameters."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain`` and
            ``init_value`` (plus ``low_cost_init_value`` where given).
        """
        return {
            # Tree depth
            'max_depth': {
                'domain': tune.randint(lower=1, upper=20),  # 1..19
                'init_value': 6,
                'low_cost_init_value': 3,
            },
            # Minimum sum of instance weight needed in a child
            'min_child_weight': {
                'domain': tune.loguniform(lower=1e-2, upper=1e3),
                'init_value': 1.0,
            },
            # Minimum loss reduction required to make a split
            'gamma': {
                'domain': tune.loguniform(lower=1e-5, upper=1e2),
                # 0 cannot be represented on a log scale; start at the
                # lower bound instead.
                'init_value': 1e-5,
            },
            # Maximum number of leaves (0 = unlimited)
            'max_leaves': {
                'domain': tune.choice([0] + list(range(10, 100, 10))),
                'init_value': 0,
            },
            # Tree growth policy
            'grow_policy': {
                'domain': tune.choice(['depthwise', 'lossguide']),
                'init_value': 'depthwise',
            },
            'n_estimators': {
                'domain': tune.qrandint(lower=100, upper=450, q=50),
                'init_value': 100,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-2, upper=1.0),
                'init_value': 0.1,
            },
        }

    def config2params(self, config):
        """Translate a sampled config into constructor parameters.

        Delegates to the base implementation first so any handling it does of
        FLAML-internal keys is kept, then drops ``max_leaves`` unless
        ``grow_policy`` is ``'lossguide'``.

        Args:
            config: Sampled hyperparameter dict.

        Returns:
            A new dict of parameters for ``xgb.XGBClassifier``.
        """
        params = dict(super().config2params(config))
        # max_leaves is only used together with the lossguide policy.
        if params.get('grow_policy') != 'lossguide':
            params.pop('max_leaves', None)
        return params

    def __init__(self, task='classification', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        # `use_label_encoder` is intentionally not set: recent XGBoost
        # releases no longer accept it and only warn.
        self.estimator_class = xgb.XGBClassifier

# 068 Run the tree-structure learner (no metric is passed to fit here).
automl_tree = AutoML()
automl_tree.add_learner('xgb_tree', XGBTreeStructureLearner)

tree_fit_settings = {
    "task": "classification",
    "time_budget": 60,
    "estimator_list": ['xgb_tree'],
    "verbose": 1,
}
automl_tree.fit(X_clf_train, y_clf_train, **tree_fit_settings)

print(f"\n트리 구조 최적화 결과: {automl_tree.best_config}")

정규화 상세

class XGBRegularizationLearner(SKLearnEstimator):
    """XGBoost classifier whose search space focuses on regularization parameters."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search. Log-scaled parameters start at their lower bound because
        0.0 cannot be represented on a log scale.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain`` and
            ``init_value``.
        """
        return {
            # L1 regularization (Lasso)
            'reg_alpha': {
                'domain': tune.loguniform(lower=1e-6, upper=1e3),
                'init_value': 1e-6,
            },
            # L2 regularization (Ridge)
            'reg_lambda': {
                'domain': tune.loguniform(lower=1e-6, upper=1e3),
                'init_value': 1.0,
            },
            # Minimum loss reduction required to make a split
            'gamma': {
                'domain': tune.loguniform(lower=1e-6, upper=1e2),
                'init_value': 1e-6,
            },
            # Basic parameters
            'max_depth': {
                'domain': tune.randint(lower=4, upper=9),  # 4..8
                'init_value': 6,
            },
            'n_estimators': {
                'domain': tune.qrandint(lower=100, upper=250, q=50),
                'init_value': 100,
            },
            'learning_rate': {
                'domain': tune.choice([0.05, 0.1, 0.15, 0.2]),
                'init_value': 0.1,
            },
        }

    def __init__(self, task='classification', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        # `use_label_encoder` is intentionally not set: recent XGBoost
        # releases no longer accept it and only warn.
        self.estimator_class = xgb.XGBClassifier

# Short reminder of what each regularization knob does.
regularization_notes = (
    "정규화 파라미터:",
    "  - reg_alpha (L1): 특성 선택 효과",
    "  - reg_lambda (L2): 가중치 축소 효과",
    "  - gamma: 분할 조건 강화",
)
print("\n".join(regularization_notes))

샘플링 상세

class XGBSamplingLearner(SKLearnEstimator):
    """XGBoost classifier whose search space focuses on row/column sampling."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain`` and
            ``init_value``.
        """
        return {
            # Row sampling
            'subsample': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 0.8,
            },
            # Column sampling (per tree)
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 0.8,
            },
            # Column sampling (per level)
            'colsample_bylevel': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 1.0,
            },
            # Column sampling (per node)
            'colsample_bynode': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 1.0,
            },
            # Basic parameters
            'max_depth': {
                'domain': tune.randint(lower=5, upper=9),  # 5..8
                'init_value': 6,
            },
            'n_estimators': {
                'domain': tune.qrandint(lower=100, upper=350, q=50),
                'init_value': 100,
            },
            'learning_rate': {
                'domain': tune.choice([0.05, 0.1, 0.15]),
                'init_value': 0.1,
            },
        }

    def __init__(self, task='classification', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        # `use_label_encoder` is intentionally not set: recent XGBoost
        # releases no longer accept it and only warn.
        self.estimator_class = xgb.XGBClassifier

# Short reminder of what each sampling knob does.
sampling_notes = (
    "샘플링 파라미터:",
    "  - subsample: 각 트리 학습에 사용할 데이터 비율",
    "  - colsample_bytree: 트리당 특성 샘플링",
    "  - colsample_bylevel: 레벨당 특성 샘플링",
    "  - colsample_bynode: 노드당 특성 샘플링",
)
print("\n".join(sampling_notes))

XGBoost 회귀

class CustomXGBRegressor(SKLearnEstimator):
    """XGBoost regressor registered with FLAML using an extended search space."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain``,
            ``init_value`` and (optionally) ``low_cost_init_value``.
        """
        return {
            'max_depth': {
                'domain': tune.randint(lower=3, upper=12),  # 3..11
                'init_value': 6,
                'low_cost_init_value': 3,
            },
            'min_child_weight': {
                'domain': tune.randint(lower=1, upper=15),  # 1..14
                'init_value': 1,
            },
            'n_estimators': {
                'domain': tune.qrandint(lower=50, upper=475, q=25),
                'init_value': 100,
                'low_cost_init_value': 50,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-3, upper=1.0),
                'init_value': 0.1,
            },
            'reg_alpha': {
                'domain': tune.loguniform(lower=1e-5, upper=1e2),
                # 0.0 cannot be represented on a log scale; start at the
                # lower bound (effectively no L1 penalty).
                'init_value': 1e-5,
            },
            'reg_lambda': {
                'domain': tune.loguniform(lower=1e-5, upper=1e2),
                'init_value': 1.0,
            },
            'subsample': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 1.0,
            },
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.5, upper=1.0),
                'init_value': 1.0,
            },
            # Regression objective
            'objective': {
                'domain': tune.choice([
                    'reg:squarederror',
                    'reg:absoluteerror',
                    'reg:pseudohubererror',
                ]),
                'init_value': 'reg:squarederror',
            },
        }

    def __init__(self, task='regression', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        self.estimator_class = xgb.XGBRegressor

# 068 Regression run: search only over the custom regressor, optimizing MAE.
automl_reg = AutoML()
automl_reg.add_learner('custom_xgb_reg', CustomXGBRegressor)

reg_fit_settings = {
    "task": "regression",
    "time_budget": 60,
    "estimator_list": ['custom_xgb_reg'],
    "metric": 'mae',
    "verbose": 1,
}
automl_reg.fit(X_reg_train, y_reg_train, **reg_fit_settings)

best_mae = automl_reg.best_loss
print(f"\n회귀 최적 설정: {automl_reg.best_config}")
print(f"MAE: {best_mae:.4f}")

GPU 가속 XGBoost

class XGBGPULearner(SKLearnEstimator):
    """XGBoost classifier with an explicit device setting (CPU by default).

    The device is fixed in ``__init__``; edit the ``device`` value there to
    train on a GPU.
    """

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain`` and
            ``init_value``.
        """
        return {
            'max_depth': {
                'domain': tune.randint(lower=3, upper=15),  # 3..14
                'init_value': 6,
            },
            'n_estimators': {
                'domain': tune.qrandint(lower=100, upper=900, q=100),
                'init_value': 100,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-3, upper=1.0),
                'init_value': 0.1,
            },
            'subsample': {
                'domain': tune.uniform(lower=0.6, upper=1.0),
                'init_value': 0.8,
            },
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.6, upper=1.0),
                'init_value': 0.8,
            },
        }

    def __init__(self, task='classification', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        self.estimator_class = xgb.XGBClassifier
        # Device selection (XGBoost 2.0+ style): keep tree_method='hist' and
        # switch `device` to 'cuda' for GPU training. The older
        # tree_method='gpu_hist' spelling is deprecated.
        self.params['tree_method'] = 'hist'
        self.params['device'] = 'cpu'  # or 'cuda' when a GPU is available
        # `use_label_encoder` is intentionally not set: recent XGBoost
        # releases no longer accept it and only warn.

# GPU guidance aligned with the `device` parameter used by XGBGPULearner:
# the GPU is selected via device='cuda' with tree_method='hist';
# 'gpu_hist' is the deprecated spelling.
print("GPU 가속 옵션:")
print("  - tree_method: 'hist' (XGBoost 2.0+에서 'gpu_hist'는 deprecated)")
print("  - device: 'cuda' (GPU) 또는 'cpu'")
print("  - GPU 메모리 부족시 'max_bin' 조정")

불균형 데이터용 XGBoost

class XGBImbalancedLearner(SKLearnEstimator):
    """XGBoost classifier whose search space includes class-imbalance parameters."""

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain`` and
            ``init_value``.
        """
        return {
            'max_depth': {
                'domain': tune.randint(lower=3, upper=10),  # 3..9
                'init_value': 5,
            },
            'n_estimators': {
                'domain': tune.qrandint(lower=50, upper=275, q=25),
                'init_value': 100,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-2, upper=1.0),
                'init_value': 0.1,
            },
            # Imbalance handling
            'scale_pos_weight': {
                'domain': tune.loguniform(lower=1e-1, upper=1e2),
                'init_value': 1.0,
            },
            'max_delta_step': {
                'domain': tune.choice([0, 1, 2, 3, 5, 10]),
                'init_value': 0,
            },
        }

    def __init__(self, task='classification', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        # `use_label_encoder` is intentionally not set: recent XGBoost
        # releases no longer accept it and only warn.
        self.estimator_class = xgb.XGBClassifier

# 068 Build an imbalanced binary dataset (about 5% positives).
X_imb, y_imb = make_classification(n_samples=5000, n_features=20,
                                    weights=[0.95, 0.05],  # 5% positive
                                    random_state=42)
# Stratify the split so the rare positive class keeps the same share in both
# the training and the test partition.
X_imb_train, X_imb_test, y_imb_train, y_imb_test = train_test_split(
    X_imb, y_imb, test_size=0.2, random_state=42, stratify=y_imb
)

print(f"불균형 비율: {np.mean(y_imb):.2%} 양성")

# 068 Run the imbalance-aware learner, optimizing ROC AUC.
automl_imb = AutoML()
automl_imb.add_learner('xgb_imb', XGBImbalancedLearner)

imb_fit_settings = {
    "task": "classification",
    "time_budget": 60,
    "estimator_list": ['xgb_imb'],
    "metric": 'roc_auc',  # metric suited to imbalanced data
    "verbose": 1,
}
automl_imb.fit(X_imb_train, y_imb_train, **imb_fit_settings)

print(f"\n불균형 데이터 최적 설정: {automl_imb.best_config}")

조기 종료가 있는 XGBoost

class XGBEarlyStopLearner(SKLearnEstimator):
    """XGBoost classifier trained with early stopping on an internal validation split.

    ``n_estimators`` is searched over large values; the number of trees that
    are actually kept is decided by ``early_stopping_rounds``.
    """

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space.

        Every ``domain`` is a ``flaml.tune`` sampling domain; a plain list or
        array would be treated as a single constant value instead of a range
        to search.

        Args:
            data_size: Not used by this implementation.
            task: Not used by this implementation.

        Returns:
            dict mapping each hyperparameter name to its ``domain`` and
            ``init_value``.
        """
        return {
            'max_depth': {
                'domain': tune.randint(lower=3, upper=12),  # 3..11
                'init_value': 6,
            },
            'n_estimators': {
                # Large values; early stopping bounds the real tree count.
                'domain': tune.choice([500, 1000, 2000]),
                'init_value': 500,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1e-3, upper=1e-1),
                'init_value': 0.05,
            },
            'early_stopping_rounds': {
                'domain': tune.choice([10, 20, 50, 100]),
                'init_value': 50,
            },
        }

    def fit(self, X_train, y_train, budget=None, **kwargs):
        """Fit with early stopping on a held-out 20% validation split.

        ``early_stopping_rounds`` is passed to the XGBoost constructor so it
        is actually applied, and ``self.params`` is left untouched so a
        repeated ``fit`` uses the same configuration.

        Args:
            X_train: Training features.
            y_train: Training labels.
            budget: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            Training time in seconds (float).
            NOTE(review): FLAML is expected to treat the return value of
            ``fit`` as training time — confirm against the installed version.
        """
        import time

        from sklearn.model_selection import train_test_split

        start_time = time.time()

        # Hold out a validation set for the early-stopping criterion.
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        # Copy so the stored configuration is not mutated between fits.
        model_params = dict(self.params)
        model_params.setdefault('early_stopping_rounds', 50)

        model = self.estimator_class(**model_params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        self._model = model

        return time.time() - start_time

    def __init__(self, task='classification', **config):
        """Build the estimator wrapper for one sampled configuration.

        Args:
            task: Task name forwarded to the FLAML base class.
            **config: Sampled hyperparameters forwarded to the base class.
        """
        super().__init__(task, **config)
        self.estimator_class = xgb.XGBClassifier
        # Only fall back to 50 when the sampled config did not provide a
        # value; an unconditional assignment would discard the tuned setting.
        self.params.setdefault('early_stopping_rounds', 50)
        # `use_label_encoder` is intentionally not set: recent XGBoost
        # releases no longer accept it and only warn.

# Short reminder of how the early-stopping setup works.
early_stop_notes = (
    "조기 종료 XGBoost:",
    "  - n_estimators를 크게 설정",
    "  - early_stopping_rounds로 실제 트리 수 제어",
    "  - 검증 세트 필요",
)
print("\n".join(early_stop_notes))

FLAML 기본 XGBoost와 비교

from sklearn.metrics import accuracy_score

# Settings shared by both searches so the only difference is the learner.
comparison_settings = {
    "task": "classification",
    "time_budget": 60,
    "verbose": 0,
}

# 068 FLAML's built-in XGBoost learner
automl_default = AutoML()
automl_default.fit(
    X_clf_train, y_clf_train,
    estimator_list=['xgboost'],
    **comparison_settings,
)

# 068 Custom XGBoost learner
automl_custom = AutoML()
automl_custom.add_learner('custom_xgb', CustomXGBClassifier)
automl_custom.fit(
    X_clf_train, y_clf_train,
    estimator_list=['custom_xgb'],
    **comparison_settings,
)

# 068 Compare held-out accuracy and the winning configurations
y_pred_default = automl_default.predict(X_clf_test)
y_pred_custom = automl_custom.predict(X_clf_test)

accuracy_default = accuracy_score(y_clf_test, y_pred_default)
accuracy_custom = accuracy_score(y_clf_test, y_pred_custom)

print("FLAML 기본 vs Custom XGBoost:")
print(f"  기본: {accuracy_default:.4f}")
print(f"  커스텀: {accuracy_custom:.4f}")
print(f"\n기본 설정: {automl_default.best_config}")
print(f"커스텀 설정: {automl_custom.best_config}")

XGBoost 파라미터 가이드

# Tuning guide, written row by row (goal, key parameters, recommended range)
# and then transposed into the column-oriented dict the DataFrame is built from.
guide_rows = [
    ('과적합 방지', 'max_depth↓, reg_alpha/lambda↑', 'depth: 3-8, reg: 1e-3~10'),
    ('학습 속도', 'learning_rate↑, subsample↓', 'lr: 0.1-0.3, subsample: 0.6-0.8'),
    ('정확도 향상', 'n_estimators↑, learning_rate↓', 'n_est: 100-1000, lr: 0.01-0.1'),
    ('불균형 처리', 'scale_pos_weight, max_delta_step', 'weight: 클래스 비율의 역수'),
]
guide_goals, guide_key_params, guide_ranges = (list(col) for col in zip(*guide_rows))
guide = {
    '목적': guide_goals,
    '주요 파라미터': guide_key_params,
    '권장 범위': guide_ranges,
}

print("\nXGBoost 파라미터 가이드:")
print(pd.DataFrame(guide).to_string(index=False))

정리

트리 구조: max_depth, min_child_weight, gamma
정규화: reg_alpha (L1), reg_lambda (L2)
샘플링: subsample, colsample_bytree/bylevel/bynode
불균형: scale_pos_weight, max_delta_step
GPU: tree_method='hist', device='cuda' (XGBoost 2.0 이상; 'gpu_hist'는 deprecated)
파라미터 간 상호작용 고려 필요

다음 글 예고

다음 글에서는 탐색 공간(Search Space) 직접 정의하기에 대해 알아보겠습니다. FLAML의 탐색 공간을 세밀하게 커스터마이징하는 방법을 다룹니다.

FLAML AutoML 마스터 시리즈 #068

개요

실습 환경

XGBoost 파라미터 이해

기본 XGBoost Custom Learner

고급 XGBoost 파라미터

트리 구조 상세

정규화 상세

샘플링 상세

XGBoost 회귀

GPU 가속 XGBoost

불균형 데이터용 XGBoost

조기 종료가 있는 XGBoost

FLAML 기본 XGBoost와 비교

XGBoost 파라미터 가이드

정리

다음 글 예고

개요