068 Isolation Forest 상세

키워드: Isolation Forest, iforest

개요

Isolation Forest는 가장 널리 사용되는 이상치 탐지 알고리즘 중 하나입니다. "이상치는 정상 데이터보다 격리하기 쉽다"는 아이디어에 기반합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

Isolation Forest 원리

핵심 아이디어

정상 데이터: 많은 분할 필요 (밀집 영역)
이상치: 적은 분할로 격리 (희소 영역)

       전체 공간
    ┌─────────────────┐
    │  ●●●●●          │
    │ ●●●●●●●         │  정상: 여러 번 분할해야 격리
    │  ●●●●●●         │
    │   ●●●●          │
    │            ★    │  이상치: 한두 번만에 격리
    └─────────────────┘

알고리즘 과정

1. 특성을 랜덤하게 선택
2. 해당 특성의 범위 내에서 랜덤 분할점 선택
3. 데이터가 격리될 때까지 1~2단계를 반복
4. 격리에 필요한 분할 횟수(경로 길이) 기록
5. 여러 트리의 평균 경로 길이로 이상 점수 계산

import numpy as np
import matplotlib.pyplot as plt

# Visualise how a few random axis-aligned cuts interact with a dense
# cluster and a single far-away point.
np.random.seed(42)
X_normal = np.random.randn(100, 2) * 0.5 + [2, 2]  # tight cluster around (2, 2)
X_outlier = np.array([[5, 5]])                      # one isolated point
X = np.vstack([X_normal, X_outlier])

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
n_cuts_per_panel = [1, 2, 5]

for idx, ax in enumerate(axes):
    depth = n_cuts_per_panel[idx]

    ax.scatter(X_normal[:, 0], X_normal[:, 1], alpha=0.5, label='Normal')
    ax.scatter(X_outlier[:, 0], X_outlier[:, 1], c='red', s=200, marker='*', label='Outlier')

    # Simulate random cuts; re-seeding with the panel index keeps each
    # panel's set of cuts reproducible.
    np.random.seed(idx)
    for _ in range(depth):
        # First draw picks the axis, second draw picks the cut position
        # inside the observed range of that feature.
        cut_is_vertical = np.random.rand() > 0.5
        feature = 0 if cut_is_vertical else 1
        split = np.random.uniform(X[:, feature].min(), X[:, feature].max())
        if cut_is_vertical:
            ax.axvline(x=split, color='green', linestyle='--', alpha=0.5)
        else:
            ax.axhline(y=split, color='green', linestyle='--', alpha=0.5)

    ax.set_title(f'Depth = {depth}')
    ax.legend()
    ax.set_xlim(-2, 7)
    ax.set_ylim(-2, 7)

fig.tight_layout()
fig.savefig('iforest_splits.png', dpi=150)

PyCaret에서 Isolation Forest

from pycaret.anomaly import *
from pycaret.datasets import get_data
import pandas as pd
import numpy as np

# Synthetic data: 1000 standard-normal rows followed by 50 rows drawn
# uniformly from [-5, 5) that play the role of outliers.
np.random.seed(42)
n_normal = 1000
n_outliers = 50

normal = np.random.randn(n_normal, 3)
outliers = np.random.uniform(-5, 5, (n_outliers, 3))

feature_names = [f'F{i}' for i in range(1, 4)]
data = pd.DataFrame(
    np.concatenate([normal, outliers], axis=0),
    columns=feature_names,
)

# Initialise the PyCaret anomaly-detection experiment
anomaly = setup(data, session_id=42, verbose=False)

# Isolation Forest with the default settings
iforest = create_model('iforest')

# Isolation Forest with the expected outlier fraction given explicitly
iforest_5pct = create_model('iforest', fraction=0.05)

주요 하이퍼파라미터

from sklearn.ensemble import IsolationForest

# NOTE: each assignment below is an independent illustration of one
# hyperparameter; every line rebinds `iforest`, so only the last one remains.

# n_estimators: number of trees in the ensemble (default 100)
iforest = IsolationForest(n_estimators=100)

# max_samples: number of samples used to train each tree (default 'auto')
iforest = IsolationForest(max_samples='auto')  # min(256, n_samples)
iforest = IsolationForest(max_samples=256)

# max_features: number of features used by each tree (default 1.0)
iforest = IsolationForest(max_features=1.0)  # all features

# contamination: expected proportion of outliers (default 'auto')
iforest = IsolationForest(contamination=0.05)  # 5%

# bootstrap: whether to use bootstrap sampling (default False)
iforest = IsolationForest(bootstrap=False)

이상 점수 (Anomaly Score)

from sklearn.ensemble import IsolationForest
import numpy as np
import matplotlib.pyplot as plt
from pycaret.anomaly import *

# Synthetic 2-D data: 200 standard-normal points plus 20 uniform points
np.random.seed(42)
normal = np.random.randn(200, 2)
outliers = np.random.uniform(-4, 4, (20, 2))
X = np.vstack([normal, outliers])

# Fit an Isolation Forest that flags 10% of the points
iforest = IsolationForest(contamination=0.1, random_state=42)
iforest.fit(X)

# score_samples: negative values, the lower the more anomalous
scores = iforest.score_samples(X)

# decision_function: score_samples minus the fitted offset
decision = iforest.decision_function(X)

# Two panels: score histogram on the left, scores over the plane on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_hist, ax_map = axes

# Histogram of scores with the 10th percentile marked
score_cutoff = np.percentile(scores, 10)
ax_hist.hist(scores, bins=30, edgecolor='black')
ax_hist.axvline(x=score_cutoff, color='red', linestyle='--',
                label='10th percentile')
ax_hist.set_xlabel('Anomaly Score')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Anomaly Scores')
ax_hist.legend()

# Scatter of the points coloured by their score
scatter = ax_map.scatter(X[:, 0], X[:, 1], c=scores, cmap='RdYlBu', alpha=0.7)
fig.colorbar(scatter, ax=ax_map, label='Anomaly Score')
ax_map.set_title('Anomaly Scores Visualization')

fig.tight_layout()
fig.savefig('iforest_scores.png', dpi=150)

경로 길이와 이상 점수 관계

import numpy as np

# 068 이상 점수 계산 공식
# 068 s(x, n) = 2^(-E(h(x)) / c(n))
# 068 - E(h(x)): 평균 경로 길이
# 068 - c(n): 정규화 상수

def c(n):
    """Return the normalising constant c(n) used by the anomaly score.

    This is the average path length of an unsuccessful search in a binary
    search tree built from ``n`` points, with the harmonic number
    approximated as ``ln(n - 1) + 0.5772156649``.

    Args:
        n: Number of samples.

    Returns:
        ``1`` when ``n == 2``, ``0`` when ``n`` is not greater than 2, and
        ``2 * (ln(n - 1) + 0.5772156649) - 2 * (n - 1) / n`` otherwise.
    """
    if n == 2:
        return 1
    if not n > 2:
        return 0
    harmonic_approx = np.log(n - 1) + 0.5772156649
    return 2 * harmonic_approx - 2 * (n - 1) / n

def anomaly_score(path_length, n_samples):
    """Convert a path length into an anomaly score ``2 ** (-h / c(n))``.

    Args:
        path_length: (Average) path length needed to isolate the point.
        n_samples: Number of samples; passed to ``c`` to get the normaliser.

    Returns:
        The score ``2 ** (-path_length / c(n_samples))``. Shorter paths give
        larger scores.

    Note:
        ``c`` returns 0 for ``n_samples`` below 2, in which case the division
        below fails; callers are expected to pass ``n_samples >= 2``.
    """
    normaliser = c(n_samples)
    exponent = -path_length / normaliser
    return 2 ** exponent

# Example: scores for a few path lengths with a subsample of 256 points
n = 256  # number of samples
for path in (2, 5, 8, 12):
    score = anomaly_score(path, n)
    print(f"경로 길이 {path}: 이상 점수 = {score:.4f}")

# Short path -> high anomaly score -> outlier

n_estimators 영향

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt

# Synthetic data: 1000 normal rows followed by 50 uniform outlier rows
np.random.seed(42)
normal = np.random.randn(1000, 5)
outliers = np.random.uniform(-5, 5, (50, 5))
X = np.vstack([normal, outliers])
y_true = np.array([0] * 1000 + [1] * 50)  # 1 marks an outlier row

# F1 score as a function of the number of trees
n_trees_range = [10, 25, 50, 100, 200, 500]
f1_scores = []

for n_trees in n_trees_range:
    iforest = IsolationForest(
        n_estimators=n_trees,
        contamination=0.05,
        random_state=42,
    )
    # fit_predict labels outliers as -1; map them to 1 to match y_true
    raw_labels = iforest.fit_predict(X)
    y_pred = (raw_labels == -1).astype(int)
    f1 = f1_score(y_true, y_pred)
    f1_scores.append(f1)

plt.figure(figsize=(10, 6))
plt.plot(n_trees_range, f1_scores, 'bo-', linewidth=2, markersize=10)
plt.title('Effect of n_estimators on Isolation Forest')
plt.xlabel('Number of Trees')
plt.ylabel('F1 Score')
plt.grid(True, alpha=0.3)
plt.savefig('iforest_n_trees.png', dpi=150)

# Around 100 trees is usually enough

max_samples 영향

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
import numpy as np

# Larger synthetic data set: 10000 normal rows followed by 500 outlier rows
np.random.seed(42)
normal = np.random.randn(10000, 5)
outliers = np.random.uniform(-5, 5, (500, 5))
X = np.vstack([normal, outliers])
y_true = np.array([0] * 10000 + [1] * 500)  # 1 marks an outlier row

# F1 score as a function of max_samples
samples_range = [64, 128, 256, 512, 1024, 'auto']
results = []

for samples in samples_range:
    iforest = IsolationForest(
        max_samples=samples,
        contamination=0.05,
        random_state=42,
    )
    # fit_predict labels outliers as -1; map them to 1 to match y_true
    raw_labels = iforest.fit_predict(X)
    y_pred = (raw_labels == -1).astype(int)
    f1 = f1_score(y_true, y_pred)
    results.append({'max_samples': samples, 'F1': f1})

import pandas as pd
summary = pd.DataFrame(results)
print(summary)

# The default 'auto' resolves to min(256, n_samples), which is usually enough

contamination 튜닝

from pycaret.anomaly import *
import pandas as pd
import numpy as np

# Synthetic data: 950 normal rows followed by 50 uniform outlier rows
np.random.seed(42)
normal = np.random.randn(950, 3)
outliers = np.random.uniform(-5, 5, (50, 3))
data = pd.DataFrame(np.vstack([normal, outliers]), columns=['F1', 'F2', 'F3'])
y_true = np.array([0] * 950 + [1] * 50)  # 1 marks an outlier row

anomaly = setup(data, session_id=42, verbose=False)

# Compare several contamination (fraction) settings
from sklearn.metrics import f1_score, precision_score, recall_score

fractions = [0.01, 0.03, 0.05, 0.07, 0.1]
results = []

for frac in fractions:
    model = create_model('iforest', fraction=frac)
    result = assign_model(model)
    # The 'Anomaly' column holds the predicted labels
    y_pred = result['Anomaly'].values

    row = {
        'Fraction': frac,
        'Detected': y_pred.sum(),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
    }
    results.append(row)

df = pd.DataFrame(results)
print("Contamination 튜닝:")
print(df.round(4))

고차원 데이터에서의 성능

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt

# Evaluate Isolation Forest across a range of feature counts
dimensions = [2, 5, 10, 20, 50, 100]
f1_scores = []

for dim in dimensions:
    # Re-seed for every dimensionality so each data set is reproducible
    np.random.seed(42)
    normal = np.random.randn(1000, dim)
    outliers = np.random.uniform(-3, 3, (50, dim))
    X = np.vstack([normal, outliers])
    y_true = np.array([0] * 1000 + [1] * 50)  # 1 marks an outlier row

    iforest = IsolationForest(contamination=0.05, random_state=42)
    # fit_predict labels outliers as -1; map them to 1 to match y_true
    raw_labels = iforest.fit_predict(X)
    y_pred = (raw_labels == -1).astype(int)

    f1 = f1_score(y_true, y_pred)
    f1_scores.append(f1)

plt.figure(figsize=(10, 6))
plt.plot(dimensions, f1_scores, 'go-', linewidth=2, markersize=10)
plt.title('Isolation Forest Performance vs Dimensionality')
plt.xlabel('Number of Dimensions')
plt.ylabel('F1 Score')
plt.grid(True, alpha=0.3)
plt.savefig('iforest_dimensions.png', dpi=150)

# Remains relatively stable in high dimensions (robust to the curse of dimensionality)

장단점

장점:

빠른 학습 및 예측
고차원 데이터에 강건
메모리 효율적
해석 가능 (경로 길이)
이상치 비율 지정 가능

단점:

지역 밀도 고려 안 함
축에 평행한 분할만 가능
contamination 파라미터 필요

Extended Isolation Forest

축에 평행하지 않은 분할로 개선:

# Requires: pip install eif
try:
    import eif
except ImportError:
    # Only the import is guarded, so errors raised while fitting or scoring
    # are not mistaken for a missing package.
    print("pip install eif 필요")
else:
    # Extended Isolation Forest: splits with random hyperplanes instead of
    # axis-parallel cuts.
    # ExtensionLevel must be set explicitly: its default of 0 reproduces the
    # standard (axis-parallel) Isolation Forest. The maximum level is
    # n_features - 1, which gives the fully extended variant.
    ext_iforest = eif.iForest(
        X,
        ntrees=100,
        sample_size=256,
        ExtensionLevel=X.shape[1] - 1,
    )
    scores = ext_iforest.compute_paths(X)

    print("Extended IF 점수 범위:", scores.min(), "~", scores.max())

정리

Isolation Forest는 "격리하기 쉬운 데이터 = 이상치" 원리
경로 길이가 짧을수록 이상치일 가능성 높음
n_estimators=100, max_samples=256이 일반적
contamination으로 이상치 비율 조절
고차원에서도 효과적

다음 글 예고

다음 글에서는 One-Class SVM 상세를 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #068

개요​

실습 환경​

Isolation Forest 원리​

핵심 아이디어​

알고리즘 과정​

PyCaret에서 Isolation Forest​

주요 하이퍼파라미터​

이상 점수 (Anomaly Score)​

경로 길이와 이상 점수 관계​

n_estimators 영향​

max_samples 영향​

contamination 튜닝​

고차원 데이터에서의 성능​

장단점​

Extended Isolation Forest​

정리​

다음 글 예고​

개요

실습 환경

Isolation Forest 원리

핵심 아이디어

알고리즘 과정

PyCaret에서 Isolation Forest

주요 하이퍼파라미터

이상 점수 (Anomaly Score)

경로 길이와 이상 점수 관계

n_estimators 영향

max_samples 영향

contamination 튜닝

고차원 데이터에서의 성능

장단점

Extended Isolation Forest

정리

다음 글 예고