099 종합 프로젝트 3 - 이상 거래 탐지

키워드: 이상 탐지, 사기 탐지, Fraud Detection

개요

이상 거래 탐지는 금융 분야에서 핵심적인 머신러닝 응용입니다. 비지도 학습 기반의 이상치 탐지와 지도 학습 기반의 분류 모델을 결합하여 강력한 사기 탐지 시스템을 구축합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0

프로젝트 목표

비즈니스 목표:
- 사기 거래 실시간 탐지
- 오탐지(False Positive) 최소화
- 금전적 손실 방지

기술 목표:
- 비지도 + 지도 학습 하이브리드 모델
- 설명 가능한 탐지 결과
- 실시간 스코어링 시스템

성공 지표:
- Recall >= 0.90 (사기 포착률)
- Precision >= 0.50 (오탐지 관리)
- 처리 시간 < 100ms

1. 데이터 생성 및 탐색

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Generate synthetic transaction data (use real data in practice).
np.random.seed(42)
n_transactions = 10000
n_fraud = 200  # 2% fraud rate
n_normal = n_transactions - n_fraud

# Relative hourly activity of normal transactions for hours 0-23.
# The raw weights add up to 1.04, and np.random.choice rejects a `p` that
# does not sum to 1, so they are normalized before being used.
hour_weights = np.array(
    [0.02] * 6 + [0.05] * 4 + [0.07] * 4 + [0.06] * 4 + [0.04] * 4 + [0.02] * 2
)
hour_probs = hour_weights / hour_weights.sum()

# Normal transactions.
normal_transactions = pd.DataFrame({
    'transaction_id': [f'T{i:06d}' for i in range(n_normal)],
    'amount': np.random.lognormal(4, 1, n_normal).clip(10, 10000),
    'hour': np.random.choice(range(24), n_normal, p=hour_probs),
    'day_of_week': np.random.choice(range(7), n_normal),
    'merchant_category': np.random.choice(
        ['retail', 'restaurant', 'online', 'travel', 'entertainment'],
        n_normal, p=[0.3, 0.25, 0.25, 0.1, 0.1]
    ),
    'is_foreign': np.random.choice([0, 1], n_normal, p=[0.95, 0.05]),
    'distance_from_home': np.random.exponential(20, n_normal).clip(0, 500),
    'transactions_last_24h': np.random.poisson(3, n_normal),
    'avg_amount_last_30d': np.random.lognormal(4, 0.5, n_normal),
    'fraud': 0
})

# Fraudulent transactions (anomalous patterns).
fraud_transactions = pd.DataFrame({
    'transaction_id': [f'T{i:06d}' for i in range(n_normal, n_transactions)],
    'amount': np.random.lognormal(6, 1.5, n_fraud).clip(100, 50000),  # larger amounts
    'hour': np.random.choice([0, 1, 2, 3, 4, 5, 22, 23], n_fraud),  # night-time only
    'day_of_week': np.random.choice(range(7), n_fraud),
    'merchant_category': np.random.choice(
        ['retail', 'online', 'travel'], n_fraud, p=[0.2, 0.5, 0.3]
    ),
    'is_foreign': np.random.choice([0, 1], n_fraud, p=[0.3, 0.7]),  # mostly foreign
    'distance_from_home': np.random.exponential(200, n_fraud).clip(50, 1000),  # far from home
    'transactions_last_24h': np.random.poisson(8, n_fraud),  # frequent transactions
    'avg_amount_last_30d': np.random.lognormal(4, 0.5, n_fraud),
    'fraud': 1
})

# Combine both groups and shuffle the rows so the label is not ordered.
data = pd.concat([normal_transactions, fraud_transactions], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"데이터 크기: {data.shape}")
print(f"사기 비율: {data['fraud'].mean():.2%}")
print(f"\n데이터 샘플:")
print(data.head())

2. 탐색적 데이터 분석

# Compare fraudulent and normal transactions on a 2x3 grid of charts.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
amount_ax, hour_ax, foreign_ax = axes[0]
distance_ax, velocity_ax, category_ax = axes[1]

# Amount distribution per class.
data.boxplot(column='amount', by='fraud', ax=amount_ax)
amount_ax.set_title('Amount by Fraud')

# Transaction counts per hour, one bar series per class.
fraud_hour = data.groupby(['hour', 'fraud']).size().unstack()
fraud_hour.plot.bar(ax=hour_ax)
hour_ax.set_title('Transactions by Hour')

# Share of foreign transactions within each class (rows sum to 1).
foreign_share = pd.crosstab(data['fraud'], data['is_foreign'], normalize='index')
foreign_share.plot.bar(stacked=True, ax=foreign_ax)
foreign_ax.set_title('Foreign Transaction Rate')

# Distance-from-home distribution per class.
data.boxplot(column='distance_from_home', by='fraud', ax=distance_ax)
distance_ax.set_title('Distance from Home')

# Number of transactions in the previous 24 hours per class.
data.boxplot(column='transactions_last_24h', by='fraud', ax=velocity_ax)
velocity_ax.set_title('Transactions in Last 24h')

# Merchant category mix within each class (columns sum to 1).
category_share = pd.crosstab(
    data['merchant_category'], data['fraud'], normalize='columns'
)
category_share.plot.bar(ax=category_ax)
category_ax.set_title('Merchant Category Distribution')

plt.tight_layout()
plt.savefig('eda_fraud.png', dpi=150)

# Mean of the key numeric features for each class.
print("\n=== 정상 vs 사기 거래 비교 ===")
class_means = data.groupby('fraud')[
    ['amount', 'distance_from_home', 'transactions_last_24h']
].mean()
print(class_means)

3. 특성 엔지니어링

# Derive the engineered features on a new frame; `data` itself is left untouched.
# DataFrame.assign evaluates the callables in order, so later features
# (rule_score) can read earlier ones (is_night), and the resulting column
# order is the keyword order below.
data_fe = data.assign(
    # Amount relative to the 30-day average (+1 keeps the denominator positive).
    amount_ratio=lambda df: df['amount'] / (df['avg_amount_last_30d'] + 1),
    # Night-time flag: hours 22-23 and 0-5.
    is_night=lambda df: df['hour'].isin([0, 1, 2, 3, 4, 5, 22, 23]).astype(int),
    # Weekend flag: day_of_week 5 or 6.
    is_weekend=lambda df: df['day_of_week'].isin([5, 6]).astype(int),
    # High-risk merchant categories: online and travel.
    high_risk_category=lambda df: df['merchant_category'].isin(['online', 'travel']).astype(int),
    # Rule-based score: number of simple red flags raised (0-5).
    rule_score=lambda df: sum(
        flag.astype(int)
        for flag in (
            df['amount'] > df['amount'].quantile(0.95),
            df['is_foreign'] == 1,
            df['is_night'] == 1,
            df['distance_from_home'] > 200,
            df['transactions_last_24h'] > 10,
        )
    ),
    # Log transforms of the two skewed features.
    log_amount=lambda df: np.log1p(df['amount']),
    log_distance=lambda df: np.log1p(df['distance_from_home']),
)

n_new_features = data_fe.shape[1] - data.shape[1]
print("특성 엔지니어링 완료")
print(f"새 특성 수: {n_new_features}")

4. 비지도 학습 - 이상치 탐지

from pycaret.anomaly import *

# Data for anomaly detection: the identifier, the fraud label and the raw
# categorical column are dropped, so the detectors never see the label.
anomaly_data = data_fe.drop(['transaction_id', 'fraud', 'merchant_category'], axis=1)

# Initialise the anomaly-detection experiment.
anom = setup(anomaly_data, session_id=42, verbose=False)

# Fit several detectors and compare what they flag against the true labels.
print("=== 이상치 탐지 모델 ===")

models = ['iforest', 'knn', 'lof']
anomaly_results = {}
fitted_detectors = {}  # model_id -> (fitted model, assign_model output)

for model_id in models:
    model = create_model(model_id, fraction=0.05)  # assume 5% of rows are anomalies
    result = assign_model(model)
    fitted_detectors[model_id] = (model, result)

    # `fraud` was removed from anomaly_data, so it is never a column of
    # `result`; the labels are looked up in data_fe through the row index.
    # NOTE(review): assumes assign_model keeps data_fe's row index — the
    # column assignments at the end of this section rely on the same alignment.
    flagged_index = result.index[result['Anomaly'] == 1]
    detected_fraud = int(data_fe.loc[flagged_index, 'fraud'].sum())
    total_anomalies = int(result['Anomaly'].sum())

    anomaly_results[model_id] = {
        'total_anomalies': total_anomalies,
        'detected_fraud': detected_fraud
    }

    print(f"{model_id}: 탐지 {total_anomalies}건, 실제 사기 {detected_fraud}건")

# Use Isolation Forest as the detector whose output feeds the classifier.
# The model fitted in the loop is reused instead of being trained again, and
# assign_model's output is read once for both columns.
best_anomaly_model, best_anomaly_result = fitted_detectors['iforest']
data_fe['anomaly_score'] = best_anomaly_result['Anomaly_Score']
data_fe['is_anomaly'] = best_anomaly_result['Anomaly']

5. 지도 학습 - 분류 모델

from pycaret.classification import *

# Classification frame: every engineered feature plus the `fraud` label,
# without the identifier and the raw categorical column.
classification_data = data_fe.drop(columns=['transaction_id', 'merchant_category'])

# Experiment configuration, grouped by purpose.
setup_options = {
    'target': 'fraud',
    # class-imbalance handling
    'fix_imbalance': True,
    # preprocessing
    'normalize': True,
    'transformation': True,
    # cross-validation
    'fold': 5,
    'session_id': 42,
    'verbose': False,
}
clf = setup(classification_data, **setup_options)

# Compare candidate models ranked by Recall and keep the top three.
print("\n=== 분류 모델 비교 ===")
best_models = compare_models(n_select=3, sort='Recall')
comparison = pull()
print(comparison)

6. 모델 튜닝 및 앙상블

# Tune each of the top models for Recall.
tuned_models = []

for model in best_models[:3]:
    print(f"\n{type(model).__name__} 튜닝 중...")
    tuned = tune_model(model, optimize='Recall', n_iter=20)
    tuned_models.append(tuned)
    results = pull()
    # The pulled score grid has one row per CV fold followed by 'Mean' and
    # 'Std' rows; report the cross-validated mean rather than fold 0.
    mean_scores = results.loc['Mean']
    print(f"Recall: {mean_scores['Recall']:.4f}, Precision: {mean_scores['Prec.']:.4f}")

# Stack the tuned models into a single ensemble.
print("\n=== 스태킹 앙상블 ===")
stacked_model = stack_models(tuned_models, optimize='Recall')
stack_results = pull()
# Same grid layout as above: use the 'Mean' row, not the first fold.
print(f"Stacked - Recall: {stack_results.loc['Mean', 'Recall']:.4f}")

7. 하이브리드 스코어링

def hybrid_fraud_score(transaction, classification_model, anomaly_threshold=0.5):
    """Blend rule, anomaly and classifier scores into one fraud score.

    Args:
        transaction: Mapping with the transaction fields. Missing fields are
            treated as 0. The anomaly score is read from ``'anomaly_score'``
            and the classifier probability from ``'fraud_probability'``.
        classification_model: Accepted for interface compatibility; this
            function does not call it.
        anomaly_threshold: Cut-off applied to the final blended score to
            decide ``is_fraud``.

    Returns:
        dict with ``scores`` (the three component scores), ``final_score``
        (weighted sum: rules 0.2, anomaly 0.3, classifier 0.5), ``is_fraud``
        and ``risk_level`` ('High' above 0.7, 'Medium' above 0.4, else 'Low').
    """
    # (field, trigger, points) — checked in this order; points add up to 1.0.
    rule_table = (
        ('amount', lambda value: value > 5000, 0.2),
        ('is_foreign', lambda value: value == 1, 0.2),
        ('is_night', lambda value: value == 1, 0.15),
        ('distance_from_home', lambda value: value > 200, 0.2),
        ('transactions_last_24h', lambda value: value > 8, 0.15),
        ('high_risk_category', lambda value: value == 1, 0.1),
    )
    rule_points = 0
    for field, is_triggered, points in rule_table:
        if is_triggered(transaction.get(field, 0)):
            rule_points += points

    scores = {
        # Rule score is capped at 1.
        'rule_score': min(rule_points, 1.0),
        # Anomaly score as supplied with the transaction.
        'anomaly_score': transaction.get('anomaly_score', 0),
        # Classifier probability as supplied with the transaction.
        'ml_score': transaction.get('fraud_probability', 0),
    }

    # Weighted combination of the three components.
    weights = {'rule_score': 0.2, 'anomaly_score': 0.3, 'ml_score': 0.5}
    final_score = sum(scores[name] * weight for name, weight in weights.items())

    if final_score > 0.7:
        risk_level = 'High'
    elif final_score > 0.4:
        risk_level = 'Medium'
    else:
        risk_level = 'Low'

    return {
        'scores': scores,
        'final_score': final_score,
        'is_fraud': final_score > anomaly_threshold,
        'risk_level': risk_level,
    }

# Try the scorer on a transaction that trips every rule.
test_transaction = dict(
    amount=8500,
    is_foreign=1,
    is_night=1,
    distance_from_home=350,
    transactions_last_24h=12,
    high_risk_category=1,
    anomaly_score=0.8,
    fraud_probability=0.75,
)

result = hybrid_fraud_score(test_transaction, stacked_model)

# Report the component scores, the blended score and the risk bucket.
report_lines = [
    "\n하이브리드 스코어링 결과:",
    f"개별 점수: {result['scores']}",
    f"최종 점수: {result['final_score']:.3f}",
    f"위험 수준: {result['risk_level']}",
]
print("\n".join(report_lines))

8. 실시간 탐지 시스템

class FraudDetectionSystem:
    """Real-time fraud detection wrapper around a saved PyCaret classifier."""

    # Values that define the derived flags; they mirror the feature-engineering
    # step that produced the training data.
    NIGHT_HOURS = (0, 1, 2, 3, 4, 5, 22, 23)
    WEEKEND_DAYS = (5, 6)
    HIGH_RISK_CATEGORIES = ('online', 'travel')
    # Columns that were dropped from the training frame before setup().
    NON_FEATURE_COLUMNS = ('transaction_id', 'merchant_category')

    def __init__(self, model_path):
        """Load the saved model from ``model_path`` and reset alert state."""
        from pycaret.classification import load_model
        self.model = load_model(model_path)
        self.threshold = 0.5
        self.alert_history = []

    def preprocess(self, transaction):
        """Return a copy of ``transaction`` with the derived features added.

        Requires ``amount``, ``hour`` and ``day_of_week``. ``avg_amount_last_30d``
        defaults to 100 when absent. ``high_risk_category`` and ``log_distance``
        are derived when their source fields are present.

        NOTE(review): the training frame also contains ``rule_score``,
        ``anomaly_score`` and ``is_anomaly``; they depend on the training-set
        amount quantile and on the anomaly model, neither of which is
        available here, so callers must supply them in ``transaction``.
        """
        processed = transaction.copy()

        processed['amount_ratio'] = processed['amount'] / (processed.get('avg_amount_last_30d', 100) + 1)
        processed['is_night'] = 1 if processed['hour'] in self.NIGHT_HOURS else 0
        processed['is_weekend'] = 1 if processed['day_of_week'] in self.WEEKEND_DAYS else 0
        processed['log_amount'] = np.log1p(processed['amount'])

        # Features the classifier was trained on but that were previously
        # missing at inference time.
        if 'high_risk_category' not in processed and 'merchant_category' in processed:
            processed['high_risk_category'] = (
                1 if processed['merchant_category'] in self.HIGH_RISK_CATEGORIES else 0
            )
        if 'distance_from_home' in processed:
            processed['log_distance'] = np.log1p(processed['distance_from_home'])

        return processed

    @staticmethod
    def _fraud_probability(label, score):
        """Convert a (predicted label, prediction score) pair into P(fraud).

        PyCaret's ``prediction_score`` is the probability of the *predicted*
        label, so for a predicted-normal row the fraud probability is the
        complement of the score.
        """
        score = float(score)
        return score if int(label) == 1 else 1.0 - score

    def score(self, transaction):
        """Score one transaction and return its fraud score and risk level."""
        processed = self.preprocess(transaction)
        # Pass only the columns the model was trained on.
        df = pd.DataFrame([processed]).drop(
            columns=list(self.NON_FEATURE_COLUMNS), errors='ignore'
        )

        from pycaret.classification import predict_model
        predictions = predict_model(self.model, data=df)

        score = self._fraud_probability(
            predictions['prediction_label'].values[0],
            predictions['prediction_score'].values[0],
        )
        is_fraud = score > self.threshold

        return {
            'transaction_id': transaction.get('transaction_id'),
            'fraud_score': float(score),
            'is_fraud': bool(is_fraud),
            'risk_level': 'High' if score > 0.7 else 'Medium' if score > 0.4 else 'Low'
        }

    def process_batch(self, transactions):
        """Score every transaction in order, alerting on each flagged one."""
        results = []
        for txn in transactions:
            result = self.score(txn)
            results.append(result)

            if result['is_fraud']:
                self.alert(result)

        return results

    def alert(self, fraud_result):
        """Record the flagged result with a timestamp and print an alert line."""
        self.alert_history.append({
            'timestamp': datetime.now().isoformat(),
            **fraud_result
        })
        print(f"[ALERT] 사기 의심 거래: {fraud_result['transaction_id']}, 점수: {fraud_result['fraud_score']:.3f}")

# Finalize the stacked ensemble and persist it under the name
# 'fraud_detection_model'; `final_model` is reused by the sections below.
final_model = finalize_model(stacked_model)
save_model(final_model, 'fraud_detection_model')

9. 성능 평가

# Final evaluation on the hold-out set.
from sklearn.metrics import classification_report, confusion_matrix

# Score the hold-out split with the stacked model that has NOT been finalized.
# finalize_model() refits on the whole dataset, hold-out rows included, so
# evaluating `final_model` here would report performance on training data.
test_predictions = predict_model(stacked_model)

# Classification report.
print("\n=== 분류 보고서 ===")
print(classification_report(
    test_predictions['fraud'],
    test_predictions['prediction_label']
))

# Confusion matrix; labels are pinned so the matrix is always 2x2 in
# (normal, fraud) order even if one class is absent from the predictions.
cm = confusion_matrix(
    test_predictions['fraud'],
    test_predictions['prediction_label'],
    labels=[0, 1]
)
print("\n=== 혼동 행렬 ===")
print(pd.DataFrame(cm,
    index=['실제 정상', '실제 사기'],
    columns=['예측 정상', '예측 사기']
))

# Cost analysis: weigh false alarms against missed fraud.
FP = int(cm[0, 1])  # false positives (false alarms)
FN = int(cm[1, 0])  # false negatives (missed fraud)
cost_per_fp = 10  # investigation cost per false alarm
cost_per_fn = 1000  # loss per missed fraud

total_cost = FP * cost_per_fp + FN * cost_per_fn
print(f"\n=== 비용 분석 ===")
print(f"오탐지(FP) 비용: ${FP * cost_per_fp:,}")
print(f"미탐지(FN) 비용: ${FN * cost_per_fn:,}")
print(f"총 비용: ${total_cost:,}")

10. 대시보드 데이터

def generate_dashboard_data(predictions_df):
    """Build the summary dictionary shown on the monitoring dashboard.

    Args:
        predictions_df: DataFrame with a ``prediction_label`` column (1 = fraud)
            and a ``prediction_score`` column holding the probability of the
            predicted label, as produced by PyCaret's ``predict_model``.

    Returns:
        dict with ``summary`` counts, a ``by_risk_level`` breakdown
        (high > 0.7, medium in (0.4, 0.7], low <= 0.4) and a static list of
        ``top_fraud_indicators``.
    """
    labels = predictions_df['prediction_label']
    confidence = predictions_df['prediction_score']

    # prediction_score is the confidence in the predicted label, so a row
    # predicted as normal with score 0.95 has a fraud probability of 0.05.
    # Risk buckets must be based on the fraud probability.
    fraud_probability = confidence.where(labels == 1, 1 - confidence)

    total = len(predictions_df)
    high = int((fraud_probability > 0.7).sum())
    medium = int(((fraud_probability > 0.4) & (fraud_probability <= 0.7)).sum())
    low = int((fraud_probability <= 0.4).sum())

    dashboard = {
        'summary': {
            'total_transactions': total,
            'flagged_transactions': int(labels.sum()),
            # mean() of an empty column is NaN; report 0.0 for an empty batch.
            'flag_rate': float(labels.mean()) if total else 0.0,
            'high_risk_count': high
        },
        'by_risk_level': {
            'high': high,
            'medium': medium,
            'low': low
        },
        'top_fraud_indicators': [
            'High transaction amount',
            'Foreign transaction',
            'Night time transaction',
            'Far from home location',
            'Frequent transactions'
        ]
    }

    return dashboard

# Build the dashboard payload from the evaluation predictions and print
# the headline counts.
dashboard_data = generate_dashboard_data(test_predictions)
summary_section = dashboard_data['summary']
risk_section = dashboard_data['by_risk_level']

print("\n=== 대시보드 데이터 ===")
print(f"총 거래: {summary_section['total_transactions']:,}")
print(f"플래그된 거래: {summary_section['flagged_transactions']:,}")
print(f"고위험 거래: {risk_section['high']:,}")

11. 프로젝트 요약

# Print the static project summary. Every figure in this text (Recall,
# Precision, F1, "92%") is hard-coded in the string literal; none of it is
# computed from the evaluation results above.
print("""
=== 이상 거래 탐지 프로젝트 요약 ===

1. 데이터 분석
   - 10,000건 거래 데이터
   - 2% 사기 비율 (200건)

2. 접근 방법
   - 비지도 학습: Isolation Forest (이상치 탐지)
   - 지도 학습: 스태킹 앙상블 (분류)
   - 하이브리드: 규칙 + ML 결합

3. 모델 성능
   - Recall: 0.92 (사기 포착률)
   - Precision: 0.58 (오탐지 관리)
   - F1 Score: 0.71

4. 주요 사기 지표
   1) 비정상적으로 큰 금액
   2) 해외 거래
   3) 야간 거래 (22시-05시)
   4) 평소 위치에서 먼 거래
   5) 24시간 내 빈번한 거래

5. 비즈니스 가치
   - 사기 손실 92% 방지
   - 자동화된 실시간 탐지
   - 조사 우선순위 지정

6. 운영 권장사항
   - 임계값 조정으로 민감도 조절
   - 월별 모델 재학습
   - 새 사기 패턴 지속 모니터링
""")

정리

- 하이브리드 접근: 규칙 + 비지도 + 지도 학습
- 불균형 처리: SMOTE, 임계값 조정
- 비용 최적화: Recall vs Precision 트레이드오프
- 실시간 처리: 스코어링 시스템 구축
- 해석 가능성: 주요 지표 식별

다음 글 예고

다음 글에서는 PyCaret 마스터 정리를 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #099

개요

실습 환경

프로젝트 목표

1. 데이터 생성 및 탐색

2. 탐색적 데이터 분석

3. 특성 엔지니어링

4. 비지도 학습 - 이상치 탐지

5. 지도 학습 - 분류 모델

6. 모델 튜닝 및 앙상블

7. 하이브리드 스코어링

8. 실시간 탐지 시스템

9. 성능 평가

10. 대시보드 데이터

11. 프로젝트 요약

정리

다음 글 예고

개요

실습 환경

프로젝트 목표

1. 데이터 생성 및 탐색

2. 탐색적 데이터 분석

3. 특성 엔지니어링

4. 비지도 학습 - 이상치 탐지

5. 지도 학습 - 분류 모델

6. 모델 튜닝 및 앙상블

7. 하이브리드 스코어링

8. 실시간 탐지 시스템

9. 성능 평가

10. 대시보드 데이터

11. 프로젝트 요약

정리

다음 글 예고