077 메모리 효율적인 학습 전략

키워드: 메모리, memory, 효율적 학습

개요

대용량 데이터나 제한된 환경에서는 메모리 관리가 중요합니다. 이 글에서는 FLAML을 사용할 때 메모리를 효율적으로 사용하는 전략을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], psutil, pandas, numpy (scikit-learn, lightgbm, xgboost는 flaml[automl] 설치 시 함께 설치됩니다)

pip install "flaml[automl]" psutil pandas numpy

메모리 사용량 모니터링

import psutil
import numpy as np
import pandas as pd
import gc

def get_memory_usage():
    """Report the current process's memory footprint.

    Returns:
        dict: ``'rss'`` and ``'vms'`` in gigabytes (bytes / 1e9), and
        ``'percent'`` as returned by ``psutil.Process.memory_percent()``.
    """
    bytes_per_gb = 1e9
    proc = psutil.Process()
    info = proc.memory_info()

    usage = {}
    usage['rss'] = info.rss / bytes_per_gb
    usage['vms'] = info.vms / bytes_per_gb
    usage['percent'] = proc.memory_percent()
    return usage

def print_memory():
    """Print the process's RSS in GB together with its memory percentage."""
    usage = get_memory_usage()
    rss_gb = usage['rss']
    pct = usage['percent']
    print(f"메모리: {rss_gb:.2f}GB (RSS), {pct:.1f}%")

# Print the memory footprint at script start as a baseline for later readings.
print("초기 메모리 상태:")
print_memory()

데이터 타입 최적화

# Generate a large synthetic dataset
np.random.seed(42)
n_samples = 100000
n_features = 50

# NumPy's default floating dtype: float64 (8 bytes per value)
X_float64 = np.random.randn(n_samples, n_features)
print(f"float64 크기: {X_float64.nbytes / 1e6:.1f} MB")

# Down-cast to float32 (4 bytes per value)
X_float32 = X_float64.astype(np.float32)
print(f"float32 크기: {X_float32.nbytes / 1e6:.1f} MB")

# Memory saved: absolute amount in MB and share of the original size.
# The share is computed from raw byte counts so that MB and bytes are never
# mixed in one expression.
saved_bytes = X_float64.nbytes - X_float32.nbytes
saved = saved_bytes / 1e6
saved_pct = saved_bytes / X_float64.nbytes * 100
print(f"절약: {saved:.1f} MB ({saved_pct:.0f}%)")

# Binary target
y = np.random.randint(0, 2, n_samples)

DataFrame 메모리 최적화

def optimize_dtypes(df):
    """Return a copy of ``df`` with numeric columns down-cast to save memory.

    * ``float64`` columns become ``float32`` (this reduces precision).
    * ``int64`` columns become the narrowest of uint8/uint16/uint32 (when all
      values are non-negative) or int8/int16/int32 (otherwise) that can hold
      every value in the column. A column whose values do not fit in 32 bits
      is left as ``int64`` instead of being cast and silently wrapped.
    * Columns of any other dtype are left unchanged.

    Args:
        df: The pandas DataFrame to optimize. It is not modified.

    Returns:
        A new DataFrame with the same columns and values.
    """
    unsigned_candidates = ('uint8', 'uint16', 'uint32')
    signed_candidates = ('int8', 'int16', 'int32')

    optimized = df.copy()

    for col in optimized.columns:
        col_type = optimized[col].dtype

        if col_type == 'float64':
            optimized[col] = optimized[col].astype('float32')
        elif col_type == 'int64':
            col_min = optimized[col].min()
            col_max = optimized[col].max()

            candidates = unsigned_candidates if col_min >= 0 else signed_candidates
            for dtype_name in candidates:
                limits = np.iinfo(dtype_name)
                # Inclusive bounds: a value equal to the dtype's min or max
                # is still representable.
                if limits.min <= col_min and col_max <= limits.max:
                    optimized[col] = optimized[col].astype(dtype_name)
                    break
            # No candidate fits: keep int64 rather than overflow.

    return optimized

# Try it out: build a DataFrame from the float64 matrix and add the target column
df = pd.DataFrame(X_float64, columns=[f'f{i}' for i in range(n_features)])
df['target'] = y

# Compare total memory (in MB) before and after dtype optimization
print(f"\n원본 DataFrame 메모리: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")

df_optimized = optimize_dtypes(df)
print(f"최적화 후 메모리: {df_optimized.memory_usage(deep=True).sum() / 1e6:.1f} MB")

샘플링 전략

from sklearn.model_selection import train_test_split

def memory_safe_sampling(X, y, max_samples=50000):
    """Return at most ``max_samples`` rows of ``X`` and ``y``, stratified on ``y``.

    Args:
        X: Feature rows; must support ``len()`` and be accepted by
            ``train_test_split``.
        y: Labels aligned with ``X``; also used as the stratification key.
        max_samples: Upper bound on the number of rows returned.

    Returns:
        ``(X, y)`` unchanged when ``len(X) <= max_samples``; otherwise a
        stratified random subset ``(X_sample, y_sample)`` of exactly
        ``max_samples`` rows, drawn with ``random_state=42``.
    """
    if len(X) <= max_samples:
        return X, y

    # Stratified sampling. An integer test_size is an absolute row count.
    # Passing the ratio max_samples / len(X) instead lets train_test_split
    # round up after float error (e.g. 7 / 100 * 100 > 7), which can return
    # one row more than the cap.
    _, X_sample, _, y_sample = train_test_split(
        X, y,
        test_size=int(max_samples),
        stratify=y,
        random_state=42
    )
    return X_sample, y_sample

# Apply the sampling: cap the float32 data at 30000 rows
X_sampled, y_sampled = memory_safe_sampling(X_float32, y, max_samples=30000)
print(f"\n샘플링: {len(X_float32)} → {len(X_sampled)}")
print_memory()

FLAML 메모리 효율 설정

from flaml import AutoML

# Memory-conscious FLAML configuration
automl = AutoML()
automl.fit(
    X_sampled, y_sampled,
    task="classification",
    time_budget=60,

    # Memory-related settings
    n_jobs=1,                    # single process (avoids memory copies)
    mem_thres=2e9,               # memory threshold (2GB)
    free_mem_ratio=0.2,          # fraction of memory to leave free

    # Favor lightweight models
    estimator_list=['lgbm', 'xgboost'],

    verbose=1
)

# Report which estimator won, then the memory footprint after the search
print(f"\n최적 모델: {automl.best_estimator}")
print_memory()

증분 학습 (Incremental Learning)

from sklearn.linear_model import SGDClassifier

def incremental_train(X, y, batch_size=10000):
    """Train an ``SGDClassifier`` (log-loss) one mini-batch at a time.

    ``X`` and ``y`` are consumed in consecutive slices of ``batch_size`` rows
    via ``partial_fit``. The full label set (``np.unique(y)``) is passed on
    the first call only. After every batch a garbage collection is run and
    a progress line is printed.

    Args:
        X: Sliceable feature rows supporting ``len()``.
        y: Sliceable labels aligned with ``X``.
        batch_size: Number of rows per ``partial_fit`` call.

    Returns:
        The classifier after all batches have been processed.
    """
    clf = SGDClassifier(loss='log_loss', random_state=42)
    total = len(X)

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        stop = start + batch_size
        # Only the very first partial_fit call is given the class list.
        first_call_kwargs = {'classes': np.unique(y)} if start == 0 else {}
        clf.partial_fit(X[start:stop], y[start:stop], **first_call_kwargs)

        gc.collect()  # garbage collection between batches
        print(f"배치 {batch_no}: {min(stop, total)}/{total}")

    return clf

# Incremental-learning demo: train on the full float32 data in batches of 20000
print("\n증분 학습:")
incremental_model = incremental_train(X_float32, y, batch_size=20000)

from sklearn.metrics import accuracy_score
# NOTE(review): X_sampled was drawn from X_float32, the same data the model
# was just trained on, so this accuracy is not a held-out score.
y_pred = incremental_model.predict(X_sampled)
print(f"증분 학습 정확도: {accuracy_score(y_sampled, y_pred):.4f}")

LightGBM 메모리 최적화

import lightgbm as lgb

# Memory-conscious LightGBM settings
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,

    # Memory optimization
    'feature_pre_filter': True,   # pre-filter unneeded features
    'max_bin': 63,                # fewer histogram bins (default 255)
    'min_data_in_bin': 5,
    'bin_construct_sample_cnt': 50000,

    # Cap model complexity
    'num_leaves': 31,
    'max_depth': 8,
}

# Fixed seed so the train/validation split is reproducible from run to run,
# matching the random_state=42 used by the stratified sampling split.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42
)

train_data = lgb.Dataset(X_tr, label=y_tr)
# The validation Dataset references the training Dataset so that it is
# binned with the training set's bin boundaries.
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Up to 100 boosting rounds; stop early once the validation metric has not
# improved for 20 rounds.
model_lgb = lgb.train(
    lgb_params,
    train_data,
    num_boost_round=100,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(20)]
)

print(f"\nLightGBM 메모리 최적화 완료")
print_memory()

가비지 컬렉션

def force_gc():
    """Run a garbage collection, print the count it returns, then print memory usage."""
    n_collected = gc.collect()
    print(f"가비지 컬렉션: {n_collected}개 객체 해제")
    print_memory()

# Delete a variable that is no longer needed, then collect
# NOTE(review): df was built with pd.DataFrame(X_float64, ...) and may still
# reference the same buffer, in which case little memory is actually released
# here — confirm.
del X_float64
force_gc()

메모리 프로파일링

def profile_memory_usage(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and print the RSS change around the call.

    A garbage collection is run before each RSS reading. The before/after
    values are printed in GB and their difference in MB.

    Args:
        func: Callable to profile.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.
    """
    def _rss_after_gc():
        # Collect first so garbage does not inflate the reading.
        gc.collect()
        return get_memory_usage()['rss']

    rss_before = _rss_after_gc()
    outcome = func(*args, **kwargs)
    rss_after = _rss_after_gc()

    print(f"메모리 변화: {rss_before:.2f}GB → {rss_after:.2f}GB")
    print(f"증가량: {(rss_after - rss_before) * 1000:.1f}MB")

    return outcome

# Profiling example
def train_model():
    """Fit a 50-tree, depth-10 random forest on the sampled data and return it."""
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=1)
    # fit() returns the estimator itself, so the fitted model is returned.
    return forest.fit(X_sampled, y_sampled)

# Train the example model under the profiler and keep the fitted model
print("\n모델 학습 메모리 프로파일링:")
model = profile_memory_usage(train_model)

메모리 효율 가이드

# Summary table, written one strategy per row: (strategy, method, effect).
guide_rows = [
    ('데이터 타입', 'float64→float32, int64→int32', '50% 메모리 절약'),
    ('샘플링', '계층적 샘플링', '선형적 감소'),
    ('n_jobs', 'n_jobs=1', '복사 방지'),
    ('모델 선택', 'LightGBM, XGBoost', '메모리 효율적'),
    ('GC', '정기적 gc.collect()', '미사용 메모리 해제'),
]

# Pivot the rows into the column-oriented dict that pandas expects.
guide = {
    header: [row[position] for row in guide_rows]
    for position, header in enumerate(('전략', '방법', '효과'))
}

print("\n메모리 효율 전략 가이드:")
guide_table = pd.DataFrame(guide)
print(guide_table.to_string(index=False))

정리

데이터 타입: float32, int32로 변환
샘플링: 대용량 데이터는 샘플링 필수
n_jobs=1: 메모리 복사 방지
증분 학습: partial_fit으로 배치 학습
LightGBM: max_bin, feature_pre_filter
가비지 컬렉션: gc.collect() 정기 실행

다음 글 예고

다음 글에서는 로깅과 콜백 함수 활용에 대해 알아보겠습니다. FLAML 학습 과정을 모니터링하고 제어하는 방법을 다룹니다.

FLAML AutoML 마스터 시리즈 #077

개요​

실습 환경​

메모리 사용량 모니터링​

데이터 타입 최적화​

DataFrame 메모리 최적화​

샘플링 전략​

FLAML 메모리 효율 설정​

증분 학습 (Incremental Learning)​

LightGBM 메모리 최적화​

가비지 컬렉션​

메모리 프로파일링​

메모리 효율 가이드​

정리​

다음 글 예고​

개요

실습 환경

메모리 사용량 모니터링

데이터 타입 최적화

DataFrame 메모리 최적화

샘플링 전략

FLAML 메모리 효율 설정

증분 학습 (Incremental Learning)

LightGBM 메모리 최적화

가비지 컬렉션

메모리 프로파일링

메모리 효율 가이드

정리

다음 글 예고