072 하이퍼파라미터 튜닝 심화

키워드: 하이퍼파라미터, 튜닝, 최적화

개요

FLAML의 기본 AutoML 외에도 tune 모듈을 통해 더 세밀한 하이퍼파라미터 튜닝이 가능합니다. 이 글에서는 고급 튜닝 기법과 옵션을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, numpy, scikit-learn

pip install "flaml[automl]" pandas numpy

tune 모듈 기초

import numpy as np
import pandas as pd
from flaml import tune
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Data preparation: generate a synthetic classification set with 2000 samples
# and 20 features (10 of them informative), then hold out 20% as a test split.
# Both steps use random_state=42 so the data and the split are repeatable.
X, y = make_classification(n_samples=2000, n_features=20,
                           n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Objective function definition
def objective(config):
    """Score one random-forest configuration with 3-fold cross-validation.

    Args:
        config: Mapping that provides 'n_estimators', 'max_depth' and
            'min_samples_split'.

    Returns:
        dict with 'accuracy' (mean of the fold accuracies) and 'std'
        (standard deviation of the fold accuracies), both computed on the
        module-level X_train / y_train.
    """
    forest_params = {
        'n_estimators': config['n_estimators'],
        'max_depth': config['max_depth'],
        'min_samples_split': config['min_samples_split'],
        'random_state': 42,  # fixed seed so equal configs score equally
    }
    fold_scores = cross_val_score(
        RandomForestClassifier(**forest_params),
        X_train,
        y_train,
        cv=3,
        scoring='accuracy',
    )
    return {'accuracy': fold_scores.mean(), 'std': fold_scores.std()}

# Search space definition: every parameter is drawn with tune.randint
search_space = dict(
    n_estimators=tune.randint(10, 300),
    max_depth=tune.randint(2, 20),
    min_samples_split=tune.randint(2, 30),
)

# Run the tuning: maximize the 'accuracy' value reported by objective()
analysis = tune.run(
    objective,
    config=search_space,
    num_samples=50,
    time_budget_s=60,
    metric='accuracy',
    mode='max',
    verbose=1,
)

# Report the best configuration and its accuracy
print(f"\n최적 설정: {analysis.best_config}")
print(f"최적 정확도: {analysis.best_result['accuracy']:.4f}")

tune 탐색 공간 타입

# Catalogue of the search-space types used in this article
space_types = dict(
    # 1. Integer uniform distribution
    int_uniform=tune.randint(1, 100),
    # 2. Float uniform distribution
    float_uniform=tune.uniform(0.0, 1.0),
    # 3. Log-uniform distribution
    log_uniform=tune.loguniform(1e-5, 1.0),
    # 4. Categorical
    categorical=tune.choice(['option_a', 'option_b', 'option_c']),
    # 5. Grid (specific values)
    # NOTE(review): verify that the installed flaml.tune exposes grid_search.
    grid=tune.grid_search([10, 50, 100, 200]),
    # 6. Integer log-uniform
    int_log=tune.lograndint(1, 1000),
    # 7. Quantized uniform
    quantized=tune.quniform(0, 1, 0.1),
    # 8. Quantized log-uniform
    quantized_log=tune.qloguniform(1e-4, 1, 5e-5),
)

# Print the Python class name behind each search-space entry
print("tune 탐색 공간 타입:")
for name, space in space_types.items():
    print(f"  {name}: {type(space).__name__}")

조건부 하이퍼파라미터

from sklearn.svm import SVC

def objective_with_conditions(config):
    """Score an SVC whose extra parameters depend on the chosen kernel.

    'gamma' is forwarded only for the 'rbf' and 'poly' kernels and 'degree'
    only for 'poly'; any other kernel gets just 'C' and 'kernel'. Missing
    conditional keys fall back to 'scale' (gamma) and 3 (degree).

    Args:
        config: Mapping with 'C' and 'kernel', optionally 'gamma' and
            'degree'.

    Returns:
        dict with 'accuracy', the mean 3-fold cross-validation accuracy on
        the module-level X_train / y_train.
    """
    svc_kwargs = {'C': config['C'], 'kernel': config['kernel']}
    kernel = svc_kwargs['kernel']

    # Kernel-specific parameters
    if kernel in ('rbf', 'poly'):
        if kernel == 'poly':
            svc_kwargs['degree'] = config.get('degree', 3)
        svc_kwargs['gamma'] = config.get('gamma', 'scale')

    fold_scores = cross_val_score(
        SVC(**svc_kwargs), X_train, y_train, cv=3, scoring='accuracy'
    )
    return {'accuracy': fold_scores.mean()}

# Conditional search space: gamma and degree are always sampled, but the
# objective function only forwards them for the kernels that use them.
conditional_space = dict(
    C=tune.loguniform(1e-3, 100),
    kernel=tune.choice(['linear', 'rbf', 'poly']),
    gamma=tune.loguniform(1e-4, 10),  # used only by rbf and poly
    degree=tune.randint(2, 6),  # used only by poly
)

analysis_cond = tune.run(
    objective_with_conditions,
    config=conditional_space,
    num_samples=30,
    time_budget_s=60,
    metric='accuracy',
    mode='max',
)

print(f"\n조건부 튜닝 결과: {analysis_cond.best_config}")

다중 목표 최적화

def multi_objective(config):
    """Evaluate a random forest on both accuracy and evaluation time.

    Multi-objective trade-off: maximize accuracy while minimizing the time
    spent, folded into a single weighted score.

    Args:
        config: Mapping that provides 'n_estimators' and 'max_depth'.

    Returns:
        dict with
          - 'accuracy': mean 3-fold cross-validation accuracy,
          - 'train_time': seconds spent inside cross_val_score,
          - 'combined': accuracy - 0.01 * train_time (weighted sum).
    """
    import time

    model = RandomForestClassifier(
        n_estimators=config['n_estimators'],
        max_depth=config['max_depth'],
        random_state=42
    )

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # duration cannot go negative or jump when the system clock is adjusted
    # (which can happen with time.time()).
    start = time.perf_counter()
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    train_time = time.perf_counter() - start

    accuracy = scores.mean()  # computed once, reused for the combined score
    return {
        'accuracy': accuracy,
        'train_time': train_time,
        # Composite metric (weighted sum): each second costs 0.01 accuracy
        'combined': accuracy - 0.01 * train_time
    }

# Optimize the composite 'combined' metric returned by multi_objective()
analysis_multi = tune.run(
    multi_objective,
    config=dict(
        n_estimators=tune.randint(10, 300),
        max_depth=tune.randint(2, 15),
    ),
    num_samples=30,
    metric='combined',
    mode='max',
)

# Report the best configuration together with both underlying objectives
print("\n다중 목표 최적화 결과:")
print(f"  최적 설정: {analysis_multi.best_config}")
print(f"  정확도: {analysis_multi.best_result['accuracy']:.4f}")
print(f"  학습 시간: {analysis_multi.best_result['train_time']:.2f}s")

시작점 지정

# Start the search from configurations already known to work well
initial_configs = [
    dict(n_estimators=100, max_depth=6, min_samples_split=2),
    dict(n_estimators=200, max_depth=10, min_samples_split=5),
]

analysis_warmstart = tune.run(
    objective,
    config=search_space,
    points_to_evaluate=initial_configs,  # starting points for the search
    num_samples=30,
    time_budget_s=60,
    metric='accuracy',
    mode='max',
)

print(f"\n웜스타트 결과: {analysis_warmstart.best_config}")

제약 조건 추가

def objective_with_constraints(config):
    """Score a random forest, penalizing configurations that are too complex.

    A configuration violates the constraint when
    n_estimators * max_depth is greater than 3000; a product of exactly 3000
    is still accepted. Violating configurations are not trained and receive
    an accuracy of 0.0 so the tuner ranks them last.

    Args:
        config: Mapping that provides 'n_estimators' and 'max_depth'.

    Returns:
        dict with 'accuracy': 0.0 on constraint violation, otherwise the mean
        3-fold cross-validation accuracy on the module-level
        X_train / y_train.
    """
    n_trees, depth = config['n_estimators'], config['max_depth']

    # Guard clause: skip the expensive evaluation for infeasible settings
    if n_trees * depth > 3000:
        return {'accuracy': 0.0}

    forest = RandomForestClassifier(
        n_estimators=n_trees, max_depth=depth, random_state=42
    )
    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=3, scoring='accuracy'
    )
    return {'accuracy': fold_scores.mean()}

# Tune with the constraint-aware objective over a wider parameter range
analysis_constrained = tune.run(
    objective_with_constraints,
    config=dict(
        n_estimators=tune.randint(10, 500),
        max_depth=tune.randint(2, 30),
    ),
    num_samples=50,
    metric='accuracy',
    mode='max',
)

# Show the winner and its complexity (n_estimators * max_depth)
print(f"\n제약 조건 튜닝 결과: {analysis_constrained.best_config}")
print(f"  복잡도: {analysis_constrained.best_config['n_estimators'] * analysis_constrained.best_config['max_depth']}")

조기 종료 전략

from flaml.tune.scheduler import ASHAScheduler

# Early stopping with the ASHA scheduler.
# NOTE(review): in the code shown here the scheduler is only constructed; it
# is not passed to any tune.run call.
scheduler = ASHAScheduler(
    max_resource=100,  # maximum resource (e.g. epochs, iterations)
    grace_period=10,   # minimum resource
    reduction_factor=3 # reduction factor
)

def objective_with_resource(config, resource=100):
    """Score a random forest whose size is scaled by a resource budget.

    The number of trees actually trained is
    config['n_estimators'] * resource / 100, truncated to an integer, so
    resource=100 trains the full configured forest.

    Args:
        config: Mapping that provides 'n_estimators' and 'max_depth'.
        resource: Budget applied to the tree count, expressed relative to
            100 (default 100 = full size).

    Returns:
        dict with 'accuracy', the mean 3-fold cross-validation accuracy on
        the module-level X_train / y_train.
    """
    # Truncation can yield 0 for small budgets (e.g. 5 trees at resource=10),
    # which is not a valid forest size, so always train at least one tree.
    n_trees = max(1, int(config['n_estimators'] * resource / 100))

    model = RandomForestClassifier(
        n_estimators=n_trees,
        max_depth=config['max_depth'],
        random_state=42
    )
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return {'accuracy': scores.mean()}

# Print a short summary of what the ASHA scheduler offers: early termination
# of unpromising configurations and resource-efficient search.
print("ASHA 스케줄러 설정:")
print("  - 유망하지 않은 설정 조기 종료")
print("  - 리소스 효율적 탐색")

튜닝 결과 분석

def analyze_tuning_results(analysis):
    """Summarize the trials of a tuning run and return them as a DataFrame.

    Prints the number of trials, the accuracy range and mean, and the
    correlation between each int64/float64 hyperparameter column and the
    accuracy.

    Args:
        analysis: Object exposing ``results``, a mapping from trial id to
            that trial's result. Each result must support
            ``.get('accuracy')`` and carry the trial configuration either
            under a ``'config'`` key (mapping-style result) or as a
            ``config`` attribute.

    Returns:
        pandas.DataFrame with one row per trial: the hyperparameter columns
        plus 'accuracy'. The frame is empty when there are no usable results.
    """
    rows = []
    for result in analysis.results.values():
        if not result:
            # Trial finished without reporting anything; nothing to tabulate.
            continue
        # The result is read with .get() below, i.e. it behaves like a
        # mapping, and a plain dict has no `.config` attribute. Prefer the
        # attribute when it exists and fall back to the 'config' key.
        config = getattr(result, 'config', None)
        if config is None:
            config = result.get('config', {})
        rows.append({**config, 'accuracy': result.get('accuracy', None)})

    # Convert the collected rows into a DataFrame
    df = pd.DataFrame(rows)

    print("튜닝 결과 분석:")
    print(f"  총 시행 횟수: {len(df)}")
    if df.empty:
        # No trials: there is no 'accuracy' column to summarize.
        return df

    accuracy = df['accuracy']
    print(f"  정확도 범위: {accuracy.min():.4f} ~ {accuracy.max():.4f}")
    print(f"  평균 정확도: {accuracy.mean():.4f}")

    # Correlation of each numeric parameter with the accuracy
    print("\n파라미터-정확도 상관관계:")
    for col in df.columns:
        if col != 'accuracy' and df[col].dtype in ['int64', 'float64']:
            corr = df[col].corr(accuracy)
            print(f"  {col}: {corr:.3f}")

    return df

# Analyze the trials of the first tuning run (the `analysis` object).
df_results = analyze_tuning_results(analysis)

베스트 프랙티스

# Best-practice table, rendered below as a two-column DataFrame
# ('항목' = item, '권장사항' = recommendation). Rows, in order:
#   search space       -> log scale for learning rate and regularization
#   starting points    -> supply good configs via points_to_evaluate
#   number of samples  -> at least 10 x the number of parameters
#   time budget        -> allow enough time (consider the data size)
#   objective function -> keep evaluation simple and fast (few CV folds)
#   reproducibility    -> fix seeds and save the configs
best_practices = {
    '항목': [
        '탐색 공간',
        '시작점',
        '샘플 수',
        '시간 예산',
        '목적 함수',
        '재현성'
    ],
    '권장사항': [
        '로그 스케일 for 학습률, 정규화',
        'points_to_evaluate로 좋은 설정 제공',
        '파라미터 수 × 10 이상',
        '충분한 시간 확보 (데이터 크기 고려)',
        '간단하고 빠른 평가 (적은 CV fold)',
        'seed 고정, config 저장'
    ]
}

# Print the table without the DataFrame index column
print("\n하이퍼파라미터 튜닝 베스트 프랙티스:")
print(pd.DataFrame(best_practices).to_string(index=False))

정리

tune.run(): 유연한 하이퍼파라미터 튜닝
탐색 공간: randint, uniform, loguniform, choice 등
조건부 파라미터: 목적 함수 내에서 처리
다중 목표: 복합 지표 정의
시작점: points_to_evaluate로 웜스타트
조기 종료: ASHAScheduler 활용

다음 글 예고

다음 글에서는 조기 종료 전략 커스터마이징에 대해 알아보겠습니다. FLAML의 다양한 조기 종료 옵션을 다룹니다.

FLAML AutoML 마스터 시리즈 #072

개요

실습 환경

tune 모듈 기초

tune 탐색 공간 타입

조건부 하이퍼파라미터

다중 목표 최적화

시작점 지정

제약 조건 추가

조기 종료 전략

튜닝 결과 분석

베스트 프랙티스

정리

다음 글 예고

개요

실습 환경

tune 모듈 기초

tune 탐색 공간 타입

조건부 하이퍼파라미터

다중 목표 최적화

시작점 지정

제약 조건 추가

조기 종료 전략

튜닝 결과 분석

베스트 프랙티스

정리

다음 글 예고