092 PyCaret의 확장과 커스터마이징

키워드: 커스터마이징, 확장

개요

PyCaret은 기본 기능 외에도 커스텀 모델, 메트릭, 전처리기를 추가할 수 있습니다. 고급 사용자를 위한 확장 방법과 PyCaret 내부 구조를 활용하는 방법을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

커스텀 모델 추가

from pycaret.classification import *
from pycaret.datasets import get_data
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

# 092 커스텀 분류기 정의
class SimpleThresholdClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier that thresholds a single feature column.

    Parameters
    ----------
    threshold : float, default=0.5
        Rows whose selected feature is strictly greater than this value are
        assigned the last label in ``classes_``; all others get the first.
    feature_idx : int, default=0
        Positional index of the feature column to inspect.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    feature_min_, feature_max_ : float
        Range of the selected feature observed in ``fit``; used to scale
        ``predict_proba`` so a row's score does not depend on which other
        rows are scored with it.
    """

    def __init__(self, threshold=0.5, feature_idx=0):
        self.threshold = threshold
        self.feature_idx = feature_idx

    def _column(self, X):
        """Return the selected feature column of ``X`` as a float array."""
        values = X.values if hasattr(X, 'iloc') else np.asarray(X)
        return values[:, self.feature_idx].astype(float)

    def fit(self, X, y):
        """Record the label set and the training range of the feature."""
        self.classes_ = np.unique(y)
        column = self._column(X)
        self.feature_min_ = column.min()
        self.feature_max_ = column.max()
        return self

    def predict(self, X):
        """Return a label from ``classes_`` for every row of ``X``."""
        above = self._column(X) > self.threshold
        # Map onto the labels seen in fit rather than a hard-coded 0/1 so
        # the estimator also works with labels such as {1, 2} or strings.
        return np.where(above, self.classes_[-1], self.classes_[0])

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of pseudo-probabilities.

        The second column is the feature value min-max scaled with the
        range learned in ``fit`` and clipped to ``[0, 1]``.
        """
        # The small epsilon avoids division by zero for a constant feature.
        span = self.feature_max_ - self.feature_min_ + 1e-10
        prob = np.clip((self._column(X) - self.feature_min_) / span, 0.0, 1.0)
        return np.column_stack([1 - prob, prob])

# Load the 'diabetes' sample dataset
data = get_data('diabetes')

# Initialise the PyCaret experiment with 'Class variable' as the target
clf = setup(data, target='Class variable', session_id=42, verbose=False)

# Hand an instance of the custom estimator to create_model
custom_model = create_model(SimpleThresholdClassifier())

# Retrieve the results table via pull() and print it
results = pull()
print(results)

커스텀 메트릭 추가

from pycaret.classification import *
from sklearn.metrics import make_scorer, matthews_corrcoef, cohen_kappa_score

# Scorer objects for use with plain scikit-learn APIs (e.g. cross_val_score).
# PyCaret does not take these: add_metric expects the raw score function.
mcc_scorer = make_scorer(matthews_corrcoef)
kappa_scorer = make_scorer(cohen_kappa_score)

# setup() has no `custom_scorer` argument; metrics are registered after setup.
clf = setup(
    data,
    target='Class variable',
    session_id=42,
    verbose=False
)

# Register the custom metrics. The ids must not clash with built-in metric
# ids, so they carry a 'custom_' prefix.
add_metric('custom_mcc', 'Custom MCC', matthews_corrcoef)
add_metric('custom_kappa', 'Custom Kappa', cohen_kappa_score)

# Create a model; the registered metrics are evaluated alongside the defaults
rf = create_model('rf')

# The results table now has 'Custom MCC' and 'Custom Kappa' columns
results = pull()
print(results)

커스텀 전처리기

from pycaret.classification import *
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class OutlierClipper(BaseEstimator, TransformerMixin):
    """Transformer that clips each column to percentile bounds learned in fit.

    ``lower_percentile`` and ``upper_percentile`` are passed to
    ``np.percentile`` (column-wise) to compute the bounds.
    """

    def __init__(self, lower_percentile=1, upper_percentile=99):
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile

    def fit(self, X, y=None):
        """Compute the per-column lower and upper clipping bounds from ``X``."""
        # Unwrap objects exposing `.values` (e.g. DataFrames); use X as-is otherwise.
        matrix = getattr(X, 'values', X)
        self.lower_, self.upper_ = (
            np.percentile(matrix, pct, axis=0)
            for pct in (self.lower_percentile, self.upper_percentile)
        )
        return self

    def transform(self, X):
        """Return a clipped copy of ``X``; the input is left untouched."""
        source = X.values if hasattr(X, 'values') else X
        return np.clip(source.copy(), self.lower_, self.upper_)

# Use the custom transformer inside a scikit-learn Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Steps: clip outliers -> standardise -> random forest
custom_pipeline = Pipeline([
    ('clipper', OutlierClipper()),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Pass the whole pipeline object to create_model as the estimator
clf = setup(data, target='Class variable', session_id=42, verbose=False)
custom_result = create_model(custom_pipeline)

get_config로 내부 접근

from pycaret.classification import *

clf = setup(data, target='Class variable', session_id=42, verbose=False)

# Inspect objects stored by setup()
print("=== 주요 설정 ===")

# Train/test splits as stored by PyCaret
X_train = get_config('X_train')
y_train = get_config('y_train')
X_test = get_config('X_test')
y_test = get_config('y_test')

# Training features after the preprocessing pipeline has been applied
X_train_transformed = get_config('X_train_transformed')

# The preprocessing pipeline itself
pipeline = get_config('pipeline')

# Model catalogue: 'models' is not a get_config variable, so use the
# models() function. A separate name keeps that function from being shadowed.
available_models = models()

print(f"학습 데이터 크기: {X_train.shape}")
print(f"테스트 데이터 크기: {X_test.shape}")
print(f"변환된 특성 수: {X_train_transformed.shape[1]}")
print(f"파이프라인 단계: {len(pipeline.steps)}")

set_config로 설정 변경

from pycaret.classification import *

from sklearn.model_selection import StratifiedKFold

clf = setup(data, target='Class variable', session_id=42, verbose=False)

# There is no 'fold' config variable; the cross-validation splitter is
# stored as 'fold_generator', and the fold count is read from it.
print(f"현재 폴드 수: {get_config('fold_generator').get_n_splits()}")

# Swap in a 10-fold stratified splitter
set_config('fold_generator', StratifiedKFold(n_splits=10))
print(f"변경된 폴드 수: {get_config('fold_generator').get_n_splits()}")

# Models created from here on use the replaced splitter
rf = create_model('rf')  # 10-fold CV

커스텀 튜닝 그리드

from pycaret.classification import *

clf = setup(data, target='Class variable', session_id=42, verbose=False)

# Create the baseline random forest
rf = create_model('rf')

# Hyperparameter search space: one list of candidate values per parameter
custom_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Tune over the custom grid instead of PyCaret's default one
tuned_rf = tune_model(rf, custom_grid=custom_grid, n_iter=50)

커스텀 교차 검증

from pycaret.classification import *
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

# Custom CV splitter: 5 stratified splits, repeated 3 times
custom_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Pass the splitter object as the fold strategy
clf = setup(
    data,
    target='Class variable',
    fold_strategy=custom_cv,
    session_id=42,
    verbose=False
)

# Evaluate the model with the repeated CV scheme
rf = create_model('rf')

모델 래퍼 만들기

from pycaret.classification import *
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class CalibratedWrapper(BaseEstimator, ClassifierMixin):
    """Wrap a classifier so its probabilities are calibrated during ``fit``.

    Parameters
    ----------
    base_model : estimator
        Classifier handed to ``CalibratedClassifierCV``.
    method : str, default='isotonic'
        Calibration method forwarded to ``CalibratedClassifierCV``.
    cv : int or CV splitter, default=5
        Cross-validation setting forwarded to ``CalibratedClassifierCV``.
        The default matches the value that used to be hard-coded.

    Attributes
    ----------
    calibrator : CalibratedClassifierCV or None
        The fitted calibrator; ``None`` until ``fit`` has been called.
    classes_ : ndarray
        Class labels exposed by the fitted calibrator.
    """

    def __init__(self, base_model, method='isotonic', cv=5):
        self.base_model = base_model
        self.method = method
        self.cv = cv
        self.calibrator = None

    def fit(self, X, y):
        """Fit a ``CalibratedClassifierCV`` around ``base_model``."""
        from sklearn.calibration import CalibratedClassifierCV

        self.calibrator = CalibratedClassifierCV(
            self.base_model,
            method=self.method,
            cv=self.cv
        )
        self.calibrator.fit(X, y)
        self.classes_ = self.calibrator.classes_
        return self

    def _fitted_calibrator(self):
        """Return the calibrator, raising ``NotFittedError`` before ``fit``."""
        if self.calibrator is None:
            from sklearn.exceptions import NotFittedError

            # NotFittedError subclasses AttributeError, which is what the
            # unguarded `None.predict(...)` call used to raise.
            raise NotFittedError(
                "CalibratedWrapper is not fitted yet; call fit() first."
            )
        return self.calibrator

    def predict(self, X):
        """Return the calibrated model's class predictions for ``X``."""
        return self._fitted_calibrator().predict(X)

    def predict_proba(self, X):
        """Return the calibrated class probabilities for ``X``."""
        return self._fitted_calibrator().predict_proba(X)

# Usage
clf = setup(data, target='Class variable', session_id=42, verbose=False)

from sklearn.ensemble import RandomForestClassifier
# Wrap a plain random forest in the calibration wrapper
base_rf = RandomForestClassifier(n_estimators=100, random_state=42)
calibrated_model = CalibratedWrapper(base_rf)

# Pass the wrapper to create_model like any other estimator
result = create_model(calibrated_model)

후처리 함수 만들기

from pycaret.classification import *
import pandas as pd
import numpy as np

# Initialise the experiment and create the random forest used by the examples below
clf = setup(data, target='Class variable', session_id=42, verbose=False)
rf = create_model('rf')

def custom_predict(model, data, threshold=0.5, return_proba=True):
    """Predict with PyCaret and add a label based on a custom threshold.

    Parameters
    ----------
    model : estimator
        Model passed to ``predict_model``.
    data : pandas.DataFrame
        Rows to score.
    threshold : float, default=0.5
        Cut-off applied to the score of the last class: rows at or above it
        get ``custom_prediction == 1``, all others ``0``.
    return_proba : bool, default=True
        When False, the ``predict_model`` output is returned without the
        ``custom_prediction`` column.

    Returns
    -------
    pandas.DataFrame
        ``predict_model`` output with one score column per class and, when
        score columns are found and ``return_proba`` is True, an extra
        ``custom_prediction`` column.
    """
    # raw_score=True yields one score column per class. The single default
    # `prediction_score` column is the probability of the *predicted* label,
    # so thresholding it would mark confident negatives as positive.
    predictions = predict_model(model, data=data, raw_score=True)

    if not return_proba:
        return predictions

    # Collect the per-class score columns
    proba_cols = [
        c for c in predictions.columns if 'prediction_score' in str(c).lower()
    ]

    if proba_cols:
        # The last score column belongs to the last (positive) class
        predictions['custom_prediction'] = (
            predictions[proba_cols[-1]] >= threshold
        ).astype(int)

    return predictions

# Predict the first 10 rows with several thresholds and compare label counts
for thresh in [0.3, 0.5, 0.7]:
    result = custom_predict(rf, data.head(10), threshold=thresh)
    print(f"\n임계값 {thresh}:")
    print(result[['prediction_label', 'custom_prediction']].value_counts())

배치 처리 유틸리티

from pycaret.classification import *
import pandas as pd
from datetime import datetime

def batch_experiment(data, target, models, experiment_name):
    """Create and tune several models, comparing mean CV accuracy.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset passed to ``setup``.
    target : str
        Name of the target column.
    models : iterable of str
        PyCaret model ids to create and tune.
    experiment_name : str
        Experiment name used for logging.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model``, ``base_accuracy``,
        ``tuned_accuracy`` and ``improvement``.
    """
    setup(
        data,
        target=target,
        log_experiment=True,
        experiment_name=experiment_name,
        session_id=42,
        verbose=False
    )

    results = []

    for model_id in models:
        print(f"Processing {model_id}...")

        # Create the baseline model. The score grid has one row per fold plus
        # summary rows, so read 'Mean' rather than the first fold.
        model = create_model(model_id, verbose=False)
        base_accuracy = pull().loc['Mean', 'Accuracy']

        # Tune and read the mean accuracy of the tuned model
        tune_model(model, n_iter=20, verbose=False)
        tuned_accuracy = pull().loc['Mean', 'Accuracy']

        results.append({
            'model': model_id,
            'base_accuracy': base_accuracy,
            'tuned_accuracy': tuned_accuracy,
            'improvement': tuned_accuracy - base_accuracy
        })

    # Explicit columns keep the frame's shape stable when `models` is empty
    return pd.DataFrame(
        results,
        columns=['model', 'base_accuracy', 'tuned_accuracy', 'improvement']
    )

# Run the batch experiment
# NOTE(review): this rebinds the name `models`, which presumably shadows the
# function of the same name brought in by the star import — confirm.
models = ['rf', 'xgboost', 'lightgbm']
batch_results = batch_experiment(data, 'Class variable', models, 'batch_exp')
print(batch_results)

플러그인 구조 만들기

from pycaret.classification import *
from abc import ABC, abstractmethod

class PyCaretPlugin(ABC):
    """Abstract base class that every PyCaret plugin derives from."""

    @abstractmethod
    def name(self):
        """Return the plugin's display name; subclasses must override."""

    @abstractmethod
    def process(self, model, data):
        """Run the plugin on ``model`` and ``data``; subclasses must override."""

class ExplainerPlugin(PyCaretPlugin):
    """Plugin that ranks features by mean absolute SHAP value."""

    def name(self):
        """Return the plugin's display name."""
        return "Model Explainer"

    def process(self, model, data):
        """Compute SHAP-based feature importance for a tree model.

        Parameters
        ----------
        model : estimator
            Model passed to ``shap.TreeExplainer``.
        data : pandas.DataFrame
            Dataset whose last column is dropped before explaining.
            NOTE(review): assumes the last column is the target — confirm
            for datasets other than the one used in this article.

        Returns
        -------
        pandas.DataFrame
            Columns ``feature`` and ``importance``, sorted by descending
            importance.
        """
        # Local imports keep the class usable without relying on names that
        # happen to be imported elsewhere in the session.
        import numpy as np
        import pandas as pd
        import shap

        X = data.drop(data.columns[-1], axis=1)

        # Compute SHAP values
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)

        if isinstance(shap_values, list):
            # List form: one (n_samples, n_features) array per class.
            # Use class index 1 when it exists.
            class_values = np.asarray(shap_values[min(1, len(shap_values) - 1)])
        else:
            class_values = np.asarray(shap_values)
            if class_values.ndim == 3:
                # Array form used by newer shap releases:
                # (n_samples, n_features, n_classes).
                class_values = class_values[:, :, min(1, class_values.shape[2] - 1)]

        # Feature importance: mean absolute SHAP value per feature
        importance = pd.DataFrame({
            'feature': X.columns,
            'importance': np.abs(class_values).mean(axis=0)
        }).sort_values('importance', ascending=False)

        return importance

# Usage
clf = setup(data, target='Class variable', session_id=42, verbose=False)
rf = create_model('rf')

# Run the explainer plugin on the random forest and the full dataset
explainer = ExplainerPlugin()
importance = explainer.process(rf, data)
print(importance)

콜백 시스템

from pycaret.classification import *
import time

class TrainingCallback:
    """Record how long each model takes to train, together with its metrics.

    Call ``on_start`` before training a model and ``on_end`` afterwards;
    every completed pair appends one entry to ``history``.
    """

    def __init__(self):
        # One dict per finished model: keys 'model', 'time', 'metrics'
        self.history = []

    def on_start(self, model_name):
        """Store the start timestamp and announce the model."""
        self.start_time = time.time()
        print(f"[시작] {model_name} 학습 시작")

    def on_end(self, model_name, metrics):
        """Append the elapsed time and metrics for ``model_name`` to history."""
        elapsed = time.time() - self.start_time
        record = dict(model=model_name, time=elapsed, metrics=metrics)
        self.history.append(record)
        print(f"[완료] {model_name}: {elapsed:.2f}초")

# Use the callback
callback = TrainingCallback()

clf = setup(data, target='Class variable', session_id=42, verbose=False)

# Wrap each create_model call between on_start and on_end
for model_id in ['rf', 'xgboost', 'lightgbm']:
    callback.on_start(model_id)
    model = create_model(model_id, verbose=False)
    metrics = pull()
    callback.on_end(model_id, metrics)

# Print the elapsed time recorded for every model
print("\n=== 학습 히스토리 ===")
for h in callback.history:
    print(f"{h['model']}: {h['time']:.2f}초")

환경 재사용

from pycaret.classification import *
import pickle

# First session
clf = setup(data, target='Class variable', session_id=42, verbose=False)
rf = create_model('rf')

# Collect the pieces of the experiment to persist
config = {
    'pipeline': get_config('pipeline'),
    'X_train': get_config('X_train'),
    'y_train': get_config('y_train')
}

with open('pycaret_config.pkl', 'wb') as f:
    pickle.dump(config, f)

# Load in another session
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open('pycaret_config.pkl', 'rb') as f:
    loaded_config = pickle.load(f)

# Reuse the stored preprocessing pipeline on the feature columns
# NOTE(review): assumes the pipeline can transform features without the target — confirm.
pipeline = loaded_config['pipeline']
new_data_transformed = pipeline.transform(data.drop('Class variable', axis=1))

정리

커스텀 모델: sklearn 호환 인터페이스
커스텀 메트릭: make_scorer 사용
get_config/set_config: 내부 설정 접근
커스텀 그리드: 세밀한 튜닝 제어
래퍼 패턴: 모델 기능 확장
플러그인 구조: 재사용 가능한 확장

다음 글 예고

다음 글에서는 모델 저장과 로드를 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #092

개요

실습 환경

커스텀 모델 추가

커스텀 메트릭 추가

커스텀 전처리기

get_config로 내부 접근

set_config로 설정 변경

커스텀 튜닝 그리드

커스텀 교차 검증

모델 래퍼 만들기

후처리 함수 만들기

배치 처리 유틸리티

플러그인 구조 만들기

콜백 시스템

환경 재사용

정리

다음 글 예고

개요

실습 환경

커스텀 모델 추가

커스텀 메트릭 추가

커스텀 전처리기

get_config로 내부 접근

set_config로 설정 변경

커스텀 튜닝 그리드

커스텀 교차 검증

모델 래퍼 만들기

후처리 함수 만들기

배치 처리 유틸리티

플러그인 구조 만들기

콜백 시스템

환경 재사용

정리

다음 글 예고