080 고급 기능 총정리

키워드: 고급, 총정리, advanced, summary

개요

Part 5에서는 FLAML의 고급 기능들을 학습했습니다. 이 글에서는 Custom Learner, 탐색 알고리즘, 분산 처리, 로깅, 재현성 등 핵심 고급 기능을 종합 정리합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl]

pip install "flaml[automl]" pandas numpy

Part 5 학습 내용 요약

import numpy as np
import pandas as pd
from flaml import AutoML

# Part 5 recap: one (subject, key point) pair per post, in publication order.
_part5_rows = [
    ('Custom Learner 기초', 'SKLearnEstimator 상속'),
    ('sklearn 모델 래핑', 'sklearn 모델 통합'),
    ('XGBoost 커스터마이징', 'XGBoost 하이퍼파라미터'),
    ('탐색 공간 정의', 'search_space() 정의'),
    ('BlendSearch 알고리즘', '글로벌+로컬 탐색'),
    ('CFO 최적화', '비용 효율적 최적화'),
    ('하이퍼파라미터 튜닝 심화', 'tune.run() 활용'),
    ('조기 종료 커스터마이징', 'early_stopping 설정'),
    ('병렬 처리 가속화', 'n_jobs, n_concurrent_trials'),
    ('Ray 연동', '분산 AutoML'),
    ('Spark 연동', '빅데이터 처리'),
    ('메모리 효율 전략', '메모리 관리'),
    ('로깅과 콜백', 'verbose, config_history'),
    ('재현성 확보', 'seed 설정'),
]

# Post numbers start at 66 and run consecutively; every column is derived
# from the row list so the three columns always have the same length.
topics = {
    '글번호': list(range(66, 66 + len(_part5_rows))),
    '주제': [subject for subject, _ in _part5_rows],
    '핵심 내용': [key_point for _, key_point in _part5_rows],
}

print("Part 5 학습 내용 요약:")
print(pd.DataFrame(topics).to_string(index=False))

1. Custom Learner 핵심 정리

from flaml.automl.model import SKLearnEstimator
from flaml import tune

class CustomLearnerTemplate(SKLearnEstimator):
    """Skeleton showing the pieces a custom FLAML learner has to provide.

    The template leaves ``estimator_class`` unset; a real learner is
    expected to assign the scikit-learn style model class it wraps.
    """

    def __init__(self, task='classification', **params):
        """Pass the task and keyword arguments on to ``SKLearnEstimator``.

        Args:
            task: Task name forwarded to the base class.
            **params: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(task, **params)
        # Placeholder: assign the sklearn model class to be tuned here.
        self.estimator_class = None

    @classmethod
    def search_space(cls, data_size, task):
        """Return the hyperparameter search space (the key method to override).

        Args:
            data_size: Not used by this template.
            task: Not used by this template.

        Returns:
            A dict mapping each hyperparameter name to its spec: ``domain``
            (the range to sample from), ``init_value`` (the starting value)
            and, optionally, ``low_cost_init_value`` (a cheap starting value).
        """
        continuous_spec = {
            'domain': tune.uniform(0.1, 1.0),
            'init_value': 0.5,
            'low_cost_init_value': 0.1,
        }
        integer_spec = {
            'domain': tune.randint(10, 100),
            'init_value': 50,
        }
        return {'param1': continuous_spec, 'param2': integer_spec}

# Print the numbered checklist of what a custom learner needs.
print("Custom Learner 핵심 요소:")
for checklist_line in (
    "  1. SKLearnEstimator 상속",
    "  2. search_space() 클래스 메서드 정의",
    "  3. domain: 탐색 범위",
    "  4. init_value: 초기값",
    "  5. low_cost_init_value: 저비용 초기값",
):
    print(checklist_line)

2. 탐색 알고리즘 비교

# Side-by-side comparison of hyperparameter search strategies.
#
# The 'FLAML 사용' column follows the documented behaviour of the
# `hpo_method` argument of AutoML.fit(): the default ("auto") uses CFO for
# sequential search and BlendSearch for parallel search, while "bs" selects
# BlendSearch explicitly. BlendSearch is therefore not the unconditional
# default, which is what the table previously stated.
algorithms = {
    '알고리즘': ['BlendSearch', 'CFO', 'Random Search', 'Grid Search'],
    '특징': [
        '글로벌+로컬 탐색 결합',
        '비용 기반 최적화',
        '무작위 샘플링',
        '전체 격자 탐색'
    ],
    '장점': [
        '효율성+탐색력 균형',
        '저비용으로 빠른 수렴',
        '구현 간단',
        '완전 탐색 보장'
    ],
    '적합한 상황': [
        '일반적인 AutoML',
        '시간/비용 제약',
        '탐색 공간 작음',
        '파라미터 적음'
    ],
    'FLAML 사용': [
        'hpo_method="bs" (병렬 탐색 시 기본)',
        'hpo_method="cfo" (순차 탐색 시 기본)',
        'hpo_method="random"',
        '직접 구현'
    ]
}

print("\n탐색 알고리즘 비교:")
print(pd.DataFrame(algorithms).to_string(index=False))

3. 탐색 공간 타입 정리

from flaml import tune

# Search-space domain types, one row per type:
# (type, purpose, example expression, typical hyperparameter).
_space_columns = ('타입', '용도', '예시', '적용 파라미터')
_space_rows = [
    ('uniform', '연속 균등 분포', 'tune.uniform(0.1, 1.0)', '비율, 가중치'),
    ('loguniform', '로그 균등 분포', 'tune.loguniform(1e-5, 1e-1)', 'learning_rate'),
    ('randint', '정수 균등 분포', 'tune.randint(10, 100)', 'n_estimators'),
    ('choice', '범주형 선택', 'tune.choice(["a", "b"])', '알고리즘 선택'),
    ('quniform', '이산화된 연속값', 'tune.quniform(0, 10, 1)', '반올림 필요한 값'),
]

# Transpose the rows into the column -> list-of-values mapping.
space_types = {
    column: list(values)
    for column, values in zip(_space_columns, zip(*_space_rows))
}

print("\n탐색 공간 타입:")
print(pd.DataFrame(space_types).to_string(index=False))

4. 병렬/분산 처리 가이드

# Parallel/distributed options, one row per option:
# (method, level it acts on, setting, effect, caveat).
_parallel_columns = ('방법', '레벨', '설정', '효과', '주의사항')
_parallel_rows = [
    ('n_jobs', '모델 내부', 'n_jobs=-1', '단일 모델 가속', '메모리 증가'),
    ('n_concurrent_trials', 'trial 레벨', 'n_concurrent_trials=4', '동시 탐색', '코어 수 고려'),
    ('Ray', '클러스터', 'use_ray=True', '분산 탐색', 'Ray 설치 필요'),
    ('Spark', '빅데이터', '샘플링 후 처리', '대용량 처리', '데이터 이동 비용'),
]

# Transpose the rows into the column -> list-of-values mapping.
parallel_guide = {
    column: list(values)
    for column, values in zip(_parallel_columns, zip(*_parallel_rows))
}

print("\n병렬/분산 처리 가이드:")
print(pd.DataFrame(parallel_guide).to_string(index=False))

5. 메모리 최적화 전략

# Memory-saving strategies, one row per strategy: (strategy, how, effect).
_memory_columns = ('전략', '방법', '효과')
_memory_rows = [
    ('데이터 타입', 'float64→float32', '50% 메모리 절약'),
    ('샘플링', '계층적 샘플링', '선형적 감소'),
    ('n_jobs 조절', 'n_jobs=1', '복사 방지'),
    ('GC', 'gc.collect()', '미사용 해제'),
    ('모델 선택', 'LightGBM, XGBoost', '내장 최적화'),
]

# Transpose the rows into the column -> list-of-values mapping.
memory_strategies = {
    column: list(values)
    for column, values in zip(_memory_columns, zip(*_memory_rows))
}

print("\n메모리 최적화 전략:")
print(pd.DataFrame(memory_strategies).to_string(index=False))

6. 로깅 및 모니터링

# Logging and monitoring options at a glance.
#
# `config_history` is described as the record of best-model updates rather
# than the full search history: FLAML documents it as a dict of
# iter -> (estimator, config, time) that gains an entry each time the best
# model is updated, so it does not contain every trial.
logging_guide = {
    '항목': ['verbose', 'config_history', '파일 로깅', '콜백'],
    '설명': [
        '출력 레벨 (0-3)',
        '최적 모델 갱신 이력 딕셔너리',
        'FileHandler 추가',
        '이벤트 기반 처리'
    ],
    '활용': [
        '디버깅, 진행 확인',
        '결과 분석, 시각화',
        '로그 보관',
        '실시간 모니터링'
    ]
}

print("\n로깅 및 모니터링:")
print(pd.DataFrame(logging_guide).to_string(index=False))

# config_history usage example.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data; X, y and the split are reused by the
# snippets further down.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=20, verbose=0)

# FLAML documents config_history as gaining an entry each time the best model
# is updated, so its length is the number of improvements found, not the total
# number of trials. The label is worded accordingly.
print(f"\n최적 모델 갱신 횟수: {len(automl.config_history)}")
print(f"최적 모델: {automl.best_estimator}")

7. 재현성 확보 체크리스트

# Reproducibility checklist, one row per item: (item, how to set it, priority).
_repro_columns = ('항목', '설정', '중요도')
_repro_rows = [
    ('전역 시드', 'random.seed(), np.random.seed()', '필수'),
    ('FLAML 시드', 'automl.fit(seed=42)', '필수'),
    ('데이터 시드', 'train_test_split(random_state=42)', '필수'),
    ('환경 저장', 'requirements.txt, env_info.json', '권장'),
    ('Docker', 'Dockerfile', '완벽한 재현성'),
]

# Transpose the rows into the column -> list-of-values mapping.
reproducibility = {
    column: list(values)
    for column, values in zip(_repro_columns, zip(*_repro_rows))
}

print("\n재현성 확보 체크리스트:")
print(pd.DataFrame(reproducibility).to_string(index=False))

8. 고급 기능 사용 패턴

# Pattern 1: restrict the search to LightGBM and override part of its
# search space through custom_hp.
print("\n=== 패턴 1: 커스텀 모델 통합 ===")

_lgbm_overrides = {
    'n_estimators': {'domain': tune.randint(50, 300)},
    'num_leaves': {'domain': tune.randint(20, 100)},
    'learning_rate': {'domain': tune.loguniform(0.01, 0.3)},
}
custom_hp = {'lgbm': _lgbm_overrides}

# Keyword arguments are collected first so the fit() call stays on one line.
_custom_fit_options = {
    'task': "classification",
    'time_budget': 20,
    'estimator_list': ['lgbm'],
    'custom_hp': custom_hp,
    'seed': 42,
    'verbose': 0,
}
automl_custom = AutoML()
automl_custom.fit(X_train, y_train, **_custom_fit_options)
print(f"결과: {automl_custom.best_config}")

# Pattern 2: short time budget combined with early stopping.
print("\n=== 패턴 2: 효율적 학습 ===")

# Keyword arguments are collected first so the fit() call stays on one line.
_efficient_fit_options = {
    'task': "classification",
    'time_budget': 15,
    'early_stop': True,
    'n_jobs': -1,
    'seed': 42,
    'verbose': 0,
}
automl_efficient = AutoML()
automl_efficient.fit(X_train, y_train, **_efficient_fit_options)
print(f"결과: {automl_efficient.best_estimator}")

# Pattern 3: memory-conscious run with limited parallelism.
print("\n=== 패턴 3: 메모리 효율 ===")

# Cast the features to float32 before handing them to FLAML.
_X_train_f32 = X_train.astype(np.float32)

_memory_fit_options = {
    'task': "classification",
    'time_budget': 15,
    'n_jobs': 1,  # single job, intended to avoid memory copies
    'estimator_list': ['lgbm', 'xgboost'],  # memory-efficient models
    'seed': 42,
    'verbose': 0,
}
automl_memory = AutoML()
automl_memory.fit(_X_train_f32, y_train, **_memory_fit_options)
print(f"결과: {automl_memory.best_estimator}")

9. 성능 튜닝 결정 트리

# Text decision tree for performance tuning. It asks four questions in turn
# (time, memory, distributed environment, reproducibility) and lists the
# settings suggested for each answer. The string is printed verbatim below.
decision_guide = """
┌─ 시간이 충분한가?
│  ├─ Yes: time_budget 늘리기, 더 넓은 탐색
│  └─ No:
│     ├─ 모델 제한 (estimator_list)
│     └─ early_stop=True

├─ 메모리가 충분한가?
│  ├─ Yes: n_jobs=-1 (모든 코어)
│  └─ No:
│     ├─ n_jobs=1
│     ├─ 데이터 타입 최적화
│     └─ 샘플링 적용

├─ 분산 환경인가?
│  ├─ Yes:
│  │  ├─ 클러스터: use_ray=True
│  │  └─ 빅데이터: Spark 샘플링 후 FLAML
│  └─ No: 로컬 병렬화

└─ 재현성이 필요한가?
   ├─ Yes: seed 설정, 환경 저장, Docker
   └─ No: 기본 설정
"""

# Print a heading, then the tree itself.
print("\n성능 튜닝 결정 가이드:")
print(decision_guide)

10. 고급 기능 종합 예제

import random
import gc

def advanced_automl_pipeline(X, y, time_budget=60, seed=42):
    """Run a seeded FLAML classification search and evaluate it on a hold-out split.

    Steps: seed the global RNGs, cast the features to float32, split 80/20,
    search over LightGBM and XGBoost with a narrowed search space, score the
    best model on the hold-out set, and return a summary.

    Args:
        X: Feature matrix; must support ``.astype(np.float32)``.
        y: Target labels.
        time_budget: Value passed to ``AutoML.fit(time_budget=...)``.
        seed: Seed applied to ``random``, ``numpy``, the train/test split
            and ``AutoML.fit``.

    Returns:
        dict with keys ``best_estimator``, ``best_config``, ``best_loss``,
        ``test_accuracy``, ``n_trials`` and ``model`` (the fitted AutoML
        object).

        ``n_trials`` is ``len(automl.config_history)``. FLAML documents
        ``config_history`` as gaining an entry each time the best model is
        updated, so this can be smaller than the number of configurations
        actually tried.
    """
    # Only accuracy_score is needed; the unused classification_report import
    # was dropped.
    from sklearn.metrics import accuracy_score

    # 1. Reproducibility: seed the global RNGs.
    random.seed(seed)
    np.random.seed(seed)

    # 2. Halve the feature memory footprint (float64 -> float32).
    X = X.astype(np.float32)

    # 3. Hold out 20% of the data for the final evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    # 4. Narrowed search space, identical for both estimators.
    custom_hp = {
        'lgbm': {
            'n_estimators': {'domain': tune.randint(50, 200)},
            'learning_rate': {'domain': tune.loguniform(0.01, 0.2)},
        },
        'xgboost': {
            'n_estimators': {'domain': tune.randint(50, 200)},
            'learning_rate': {'domain': tune.loguniform(0.01, 0.2)},
        }
    }

    # 5. Run the search.
    automl = AutoML()
    automl.fit(
        X_train, y_train,
        task="classification",
        time_budget=time_budget,
        estimator_list=['lgbm', 'xgboost'],
        custom_hp=custom_hp,
        early_stop=True,
        n_jobs=-1,
        seed=seed,
        verbose=1
    )

    # 6. Evaluate on the hold-out split.
    y_pred = automl.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # 7. Ask the garbage collector to reclaim unreferenced objects.
    gc.collect()

    # 8. Summarise the run; the fitted AutoML object is returned for reuse.
    result = {
        'best_estimator': automl.best_estimator,
        'best_config': automl.best_config,
        'best_loss': automl.best_loss,
        'test_accuracy': accuracy,
        'n_trials': len(automl.config_history),
        'model': automl
    }

    return result

# Run the pipeline on the synthetic dataset and report the outcome.
print("\n=== 고급 AutoML 파이프라인 실행 ===")
result = advanced_automl_pipeline(X, y, time_budget=30, seed=42)

# Plain string: the original f-string had no placeholders.
print("\n최종 결과:")
print(f"  최적 모델: {result['best_estimator']}")
print(f"  테스트 정확도: {result['test_accuracy']:.4f}")
# result['n_trials'] is len(config_history), which FLAML documents as the
# record of best-model updates, so the label says that rather than "trials".
print(f"  최적 모델 갱신 횟수: {result['n_trials']}")

Part 5 핵심 정리

# Part 5 wrap-up, one row per area: (area, key features, essential names).
_summary_columns = ('영역', '핵심 기능', '필수 파라미터')
_summary_rows = [
    ('모델 확장', 'Custom Learner, custom_hp', 'SKLearnEstimator, search_space()'),
    ('탐색 최적화', 'BlendSearch, CFO, 탐색 공간', 'tune.uniform/loguniform/randint'),
    ('성능 가속', '병렬 처리, Ray, Spark', 'n_jobs, n_concurrent_trials'),
    ('운영', '로깅, 재현성, 메모리 관리', 'seed, verbose, config_history'),
]

# Transpose the rows into the column -> list-of-values mapping.
summary = {
    column: list(values)
    for column, values in zip(_summary_columns, zip(*_summary_rows))
}

print("\n=== Part 5 핵심 정리 ===")
print(pd.DataFrame(summary).to_string(index=False))

다음 Part 예고

Part 6에서는 실전 프로젝트와 배포를 다룹니다:

실전 프로젝트 (고객 이탈 예측, 사기 탐지 등)
모델 배포 (API, Docker, 클라우드)
모니터링과 유지보수
전체 시리즈 총정리

지금까지 배운 FLAML의 기초부터 고급 기능까지를 실전 프로젝트에 적용하는 방법을 알아보겠습니다.

FLAML AutoML 마스터 시리즈 #080 Part 5 고급 기능 완료

개요

실습 환경

Part 5 학습 내용 요약

1. Custom Learner 핵심 정리

2. 탐색 알고리즘 비교

3. 탐색 공간 타입 정리

4. 병렬/분산 처리 가이드

5. 메모리 최적화 전략

6. 로깅 및 모니터링

7. 재현성 확보 체크리스트

8. 고급 기능 사용 패턴

9. 성능 튜닝 결정 트리

10. 고급 기능 종합 예제

Part 5 핵심 정리

다음 Part 예고

개요