039 분류 실전 - 스팸 메일 분류

키워드: 스팸, 텍스트 분류

개요

스팸 메일 분류는 텍스트 분류의 대표적인 예입니다. 이 글에서는 텍스트 데이터를 전처리하고 PyCaret으로 스팸 분류 모델을 구축합니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: pycaret[full]>=3.0, nltk

프로젝트 개요

목표: 이메일/SMS가 스팸인지 정상인지 분류

비즈니스 가치:

불필요한 메일 자동 필터링
사용자 경험 향상
보안 위협 차단

데이터 로드

import pandas as pd
import numpy as np

# UCI SMS Spam Collection: a headerless, tab-separated file whose two columns
# are read as 'label' and 'message'.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
data = pd.read_table(url, header=None, names=['label', 'message'])

# Report the table shape, the share of each class, and the first few rows.
print(f"데이터 크기: {data.shape}")
print("\n클래스 분포:", data['label'].value_counts(normalize=True), sep="\n")
print("\n샘플 메시지:", data.head(), sep="\n")

텍스트 전처리

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads
# from; quiet=True asks NLTK not to print download progress.
nltk.download('stopwords', quiet=True)

# Text-cleaning helper applied to every message before vectorization.
def preprocess_text(text):
    """Normalize one English message into a string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, split on whitespace, drop NLTK English
    stop words, and Porter-stem what remains.

    The stop-word set and the stemmer do not depend on the input, so they are
    created on the first call and reused afterwards instead of being rebuilt
    for every message.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).

    Raises:
        LookupError: Raised by NLTK on the first call if the 'stopwords'
            corpus has not been downloaded.
    """
    resources = getattr(preprocess_text, "_resources", None)
    if resources is None:
        resources = (frozenset(stopwords.words('english')), PorterStemmer())
        preprocess_text._resources = resources
    stop_words, stemmer = resources

    # Lowercase first, then strip digits, punctuation and other symbols.
    # Characters are deleted rather than replaced by a space, so "don't"
    # becomes "dont".
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Stop words are filtered before stemming, so the comparison is made on
    # the unstemmed token.
    return ' '.join(
        stemmer.stem(token)
        for token in cleaned.split()
        if token not in stop_words
    )

# Run the cleaner over every message and keep the result in a new 'processed'
# column next to the original text.
data['processed'] = data['message'].apply(preprocess_text)

# Show the first message before and after cleaning as a quick sanity check.
print("전처리 전후 비교:")
print(f"원본: {data['message'].iloc[0]}")
print(f"처리: {data['processed'].iloc[0]}")

특성 추출 (TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the cleaned messages with TF-IDF, capped at 3000 features.
# NOTE(review): the vectorizer is fit on every row of `data`, i.e. before any
# train/test split happens, so vocabulary and IDF weights also reflect rows
# that are later held out. Acceptable for a walkthrough; confirm before
# quoting the resulting scores.
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(data['processed'])

# Densify into a DataFrame with one column per term, then attach the target.
# NOTE(review): if the vocabulary ever contains the term 'label', the
# assignment below would overwrite that feature column - verify.
feature_names = tfidf.get_feature_names_out()
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
df_tfidf['label'] = data['label'].values

print(f"TF-IDF 특성 수: {len(feature_names)}")
print(f"최종 데이터 크기: {df_tfidf.shape}")

PyCaret 설정

from pycaret.classification import *

# Initialize the PyCaret classification experiment on the TF-IDF table, with
# the 'label' column as the prediction target.
clf = setup(
    data=df_tfidf,
    target='label',
    session_id=42,  # fixed session id, presumably so runs are repeatable
    verbose=False
)

모델 비교

# Compare a shortlist of algorithms, ranked by F1. n_select=3 requests the
# three best, so `best_models` is used as a list further down.
print("=== 모델 비교 ===")
best_models = compare_models(
    include=['nb', 'lr', 'rf', 'xgboost', 'lightgbm'],
    sort='F1',
    n_select=3
)

Naive Bayes 상세 분석

# Train a Naive Bayes model on its own; it is a common strong baseline for
# text classification.
print("\n=== Naive Bayes 분석 ===")
nb = create_model('nb')

# Produce the confusion matrix and the per-class report; save=True asks
# PyCaret to write each plot to a file.
plot_model(nb, plot='confusion_matrix', save=True)
plot_model(nb, plot='class_report', save=True)

로지스틱 회귀 분석

# Train logistic regression and read its weights to see which stems push a
# message toward either class.
print("\n=== 로지스틱 회귀 분석 ===")
lr = create_model('lr')

import matplotlib.pyplot as plt

# First row of the weight matrix, plus feature indices ordered from the most
# negative weight to the most positive one.
# NOTE(review): reading positive weights as "spam" assumes spam is the
# positive class and that the model's columns follow `feature_names` order -
# confirm against the fitted pipeline.
coefficients = lr.coef_[0]
top_indices = np.argsort(coefficients)

# The 15 largest weights (treated as spam indicators).
top_spam_idx = top_indices[-15:]
top_spam_words = list(feature_names[top_spam_idx])
top_spam_coefs = list(coefficients[top_spam_idx])

# The 15 smallest weights (treated as ham indicators).
top_ham_idx = top_indices[:15]
top_ham_words = list(feature_names[top_ham_idx])
top_ham_coefs = list(coefficients[top_ham_idx])

# One horizontal bar chart per class, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (top_spam_words, top_spam_coefs, 'red', 'Top Spam Indicator Words'),
    (top_ham_words, top_ham_coefs, 'green', 'Top Ham Indicator Words'),
]
for ax, (words, coefs, bar_color, title) in zip(axes, panels):
    ax.barh(words, coefs, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel('Coefficient')

plt.tight_layout()
plt.savefig('spam_word_analysis.png', dpi=150)

모델 튜닝

# Tune the hyperparameters of the top-ranked model from the comparison,
# optimizing for F1.
print("\n=== 모델 튜닝 ===")
tuned_model = tune_model(best_models[0], optimize='F1')

앙상블

# Combine the selected top models into one blended ensemble.
# Note: this blends the untuned `best_models`, not `tuned_model`.
print("\n=== 앙상블 ===")
blended = blend_models(best_models)

비즈니스 임계값 조정

스팸 필터에서는 정상 메일을 스팸으로 오판(False Positive)하는 일을 최소화해야 합니다. 따라서 Precision을 기준으로 분류 임계값을 조정합니다:

# Score the blended model; with no `data` argument PyCaret predicts on its
# hold-out split.
predictions = predict_model(blended)

# Favor precision, i.e. keep false positives (ham flagged as spam) low.
from sklearn.metrics import precision_recall_curve

# Spam is the positive class (1), ham the negative class (0).
y_true = (predictions['label'] == 'spam').astype(int)

# `prediction_score` is the probability of whichever class was predicted, not
# the probability of spam: a confident ham prediction also scores near 1.
# Convert it to P(spam) so it matches the positive class in `y_true`; for rows
# predicted as ham that is 1 - score (binary problem).
# Assumes `prediction_label` carries the same 'spam'/'ham' strings as `label`.
y_score = predictions['prediction_score'].where(
    predictions['prediction_label'] == 'spam',
    1 - predictions['prediction_score'],
)

precision, recall, thresholds = precision_recall_curve(y_true, y_score)

# Find a threshold that reaches at least 95% precision. `precision` has one
# more entry than `thresholds`, hence the [:-1].
target_precision = 0.95
valid_idx = precision[:-1] >= target_precision
if valid_idx.any():
    # Thresholds are returned in increasing order, so the first match is the
    # lowest qualifying cut-off.
    optimal_threshold = thresholds[valid_idx][0]
    print(f"Precision {target_precision*100}% 달성 임계값: {optimal_threshold:.4f}")
else:
    print("목표 Precision 달성 불가")

최종 모델 저장

# Finalize the blended model before persisting it (finalize_model is expected
# to refit on all available data - confirm against the PyCaret docs).
final_model = finalize_model(blended)

# Save the finalized model under the name 'spam_classifier'.
save_model(final_model, 'spam_classifier')

# Save the fitted TF-IDF vectorizer too: new messages must be transformed
# with the same vocabulary before the model can score them.
import joblib
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

print("모델 및 벡터라이저 저장 완료")

새 메시지 예측

def predict_spam(message):
    """Classify one raw message with the saved vectorizer and model.

    The message is cleaned with ``preprocess_text``, transformed by the
    TF-IDF vectorizer stored in 'tfidf_vectorizer.pkl', and scored by the
    PyCaret model stored as 'spam_classifier'.

    Both artifacts are read from disk on the first call only and reused on
    later calls, so classifying many messages does not reload them each time.
    A side effect is that artifacts re-saved after the first call are not
    picked up until the process restarts.

    Args:
        message: The raw message text.

    Returns:
        A ``(label, score)`` tuple taken from the 'prediction_label' and
        'prediction_score' columns of the single predicted row.
    """
    artifacts = getattr(predict_spam, "_artifacts", None)
    if artifacts is None:
        artifacts = (
            joblib.load('tfidf_vectorizer.pkl'),
            load_model('spam_classifier'),
        )
        predict_spam._artifacts = artifacts
    vectorizer, model = artifacts

    # Apply the same cleaning used for the training data.
    processed = preprocess_text(message)

    # One-row feature table with the same columns the vectorizer produced.
    features = vectorizer.transform([processed])
    df_features = pd.DataFrame(
        features.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

    prediction = predict_model(model, data=df_features)
    return prediction['prediction_label'].values[0], prediction['prediction_score'].values[0]

# Sample messages for a quick smoke test: two spam-like and two ordinary ones.
test_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim.",
    "Hey, are we still meeting for lunch tomorrow?",
    "URGENT: Your account has been compromised. Click to verify.",
    "Can you send me the project files?"
]

print("=== 새 메시지 예측 ===")
# map() is lazy, so each message is still classified right before its own
# result is printed.
for msg, (label, score) in zip(test_messages, map(predict_spam, test_messages)):
    print(f"\n메시지: {msg[:50]}...")
    print(f"예측: {label} (확률: {score:.2%})")

성능 개선 팁

# 1. Add n-grams: unigrams and bigrams, up to 5000 features
tfidf_ngram = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# 2. Allow a larger vocabulary
tfidf_more = TfidfVectorizer(max_features=10000)

# 3. Handle class imbalance
# Note: this calls setup() again and rebinds `clf`, starting a fresh
# experiment with fix_imbalance enabled.
clf = setup(
    data=df_tfidf,
    target='label',
    fix_imbalance=True,
    session_id=42
)

# 4. Add hand-crafted features, for example:
#    - message length
#    - share of uppercase characters
#    - number of special characters
#    - whether the message contains a URL

정리

텍스트 분류는 전처리 → 벡터화 → 모델링 단계
TF-IDF로 텍스트를 수치 벡터로 변환
Naive Bayes가 텍스트 분류에 효과적
스팸 필터는 Precision 중시 (정상을 스팸으로 오판하면 안 됨)
중요 단어 분석으로 모델 해석 가능

다음 글 예고

다음 글에서는 분류 실전 - 질병 진단 예측을 다룹니다.

PyCaret 머신러닝 마스터 시리즈 #039

개요

실습 환경

프로젝트 개요

데이터 로드

텍스트 전처리

특성 추출 (TF-IDF)

PyCaret 설정

모델 비교

Naive Bayes 상세 분석

로지스틱 회귀 분석

모델 튜닝

앙상블

비즈니스 임계값 조정

최종 모델 저장

새 메시지 예측

성능 개선 팁

정리

다음 글 예고

개요