086 종합 프로젝트 - 텍스트 분류 (TF-IDF + FLAML)

키워드: 텍스트, TF-IDF, 자연어 처리, NLP

개요

텍스트 데이터를 TF-IDF로 벡터화하고 FLAML AutoML로 분류 모델을 구축합니다. 스팸 필터링, 감성 분석, 문서 분류 등에 적용할 수 있는 기본 NLP 파이프라인을 알아봅니다.

실습 환경

Python 버전: 3.11 권장
필요 패키지: flaml[automl], pandas, numpy, scikit-learn, matplotlib, seaborn

pip install "flaml[automl]" pandas numpy scikit-learn matplotlib seaborn

TF-IDF 개념

import numpy as np
import pandas as pd

# The three quantities behind TF-IDF: what each one measures and how it behaves.
tfidf_concept = {
    '요소': ['TF (Term Frequency)', 'IDF (Inverse Document Frequency)', 'TF-IDF'],
    '의미': ['문서 내 단어 빈도', '전체 문서에서 희귀도', 'TF × IDF'],
    '특징': ['자주 등장하면 높음', '희귀 단어일수록 높음', '중요한 단어 강조'],
}

# Print the heading, then the summary as a table without the row index.
print("TF-IDF 개념:", pd.DataFrame(tfidf_concept).to_string(index=False), sep="\n")

데이터 생성 (뉴스 분류)

# Seed NumPy's global RNG so the synthetic corpus is the same on every run.
np.random.seed(42)

# Ten signature keywords for each news category.
category_keywords = {
    'sports': 'game team player score win match championship league coach season'.split(),
    'technology': 'software app device digital data computer AI cloud security internet'.split(),
    'politics': 'government election vote policy president congress law democrat republican bill'.split(),
    'business': 'market stock company profit revenue investment CEO growth trade economy'.split(),
}

# Filler words shared by every category.
common_words = 'the is a in to of and for on with that this new more said'.split()

def generate_document(category, n_words=50):
    """Generate a synthetic document for *category* as a space-joined string.

    Each of the ``n_words`` tokens is drawn from the category's keyword list
    with probability 0.4, otherwise from the shared ``common_words`` list.
    Raises ``KeyError`` if *category* is not a key of ``category_keywords``.
    """
    keywords = category_keywords[category]
    tokens = []

    for _ in range(n_words):
        # One uniform draw picks the pool, a second draw picks the word.
        pool = keywords if np.random.random() < 0.4 else common_words
        tokens.append(np.random.choice(pool))

    return ' '.join(tokens)

# Build a balanced corpus: the same number of documents for every category.
n_samples_per_class = 500
documents = []
labels = []

for category in category_keywords:
    documents.extend(generate_document(category) for _ in range(n_samples_per_class))
    labels.extend([category] * n_samples_per_class)

df = pd.DataFrame({'text': documents, 'category': labels})
print(f"데이터 크기: {len(df)}")
print(f"카테고리 분포:\n{df['category'].value_counts()}")
# Show the first 100 characters of the first sports document as a sample.
print("\n샘플 문서 (sports):")
print(df.loc[df['category'] == 'sports', 'text'].iloc[0][:100])

TF-IDF 벡터화

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; stratify so every category
# keeps the same share in both splits.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['text'], df['category'],
    test_size=0.2, random_state=42, stratify=df['category']
)

# TF-IDF vectorizer configuration.
tfidf = TfidfVectorizer(
    max_features=1000,      # cap on the vocabulary size
    min_df=2,               # drop terms found in fewer than 2 documents
    max_df=0.95,            # drop terms found in more than 95% of documents
    ngram_range=(1, 2),     # unigrams + bigrams
    stop_words='english'    # remove the built-in English stop words
)

# Learn the vocabulary from the training split only, then reuse it on the
# test split so no test information leaks into the features.
X_train = tfidf.fit_transform(X_train_text).toarray()
X_test = tfidf.transform(X_test_text).toarray()

print(f"TF-IDF 특성 수: {X_train.shape[1]}")
print(f"학습 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")

# Rank features by their mean TF-IDF weight over the training documents.
# Slicing feature_names[:20] directly would only show the first 20 entries
# in vocabulary order, not the most heavily weighted terms.
feature_names = tfidf.get_feature_names_out()
top_by_weight = np.argsort(X_train.mean(axis=0))[::-1][:20]
print(f"\n상위 20개 특성: {feature_names[top_by_weight].tolist()}")

FLAML로 텍스트 분류

from flaml import AutoML
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Encode the string categories as integers; the mapping is learned from the
# training labels and reused for the test labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Let FLAML search the listed tree-based learners for 60 seconds,
# optimising accuracy.
automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train_encoded,
    task="classification",
    metric="accuracy",
    estimator_list=['lgbm', 'xgboost', 'rf', 'extra_tree'],
    time_budget=60,
    n_jobs=-1,
    seed=42,
    verbose=2,
)

print("\n=== FLAML 결과 ===")
print(f"최적 모델: {automl.best_estimator}")
print(f"최적 설정: {automl.best_config}")

모델 평가

# Predict encoded labels for the held-out split.
y_pred = automl.predict(X_test)

# Report overall accuracy followed by the per-class report, using the
# original category names as row labels.
print(
    "\n=== 분류 성능 ===",
    f"정확도: {accuracy_score(y_test_encoded, y_pred):.4f}",
    "\n분류 리포트:",
    classification_report(y_test_encoded, y_pred, target_names=le.classes_),
    sep="\n",
)

혼동 행렬 시각화

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of true (rows) versus predicted (columns) encoded labels.
cm = confusion_matrix(y_test_encoded, y_pred)

# Draw it as an annotated heatmap labelled with the category names.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel('예측')
ax.set_ylabel('실제')
ax.set_title('텍스트 분류 혼동 행렬')
fig.tight_layout()
plt.show()

특성 중요도 분석

# Only estimators that expose `feature_importances_` can be inspected here.
if hasattr(automl.model.estimator, 'feature_importances_'):
    importances = automl.model.estimator.feature_importances_

    # argsort is ascending, so the last 20 positions hold the largest scores.
    top_indices = np.argsort(importances)[-20:]
    top_words = feature_names[top_indices]
    top_importances = importances[top_indices]

    # Print from most to least important.
    print("\n=== 상위 20개 중요 단어 ===")
    for idx in top_indices[::-1]:
        print(f"  {feature_names[idx]}: {importances[idx]:.4f}")

    # Horizontal bars are drawn bottom-up, so the ascending order puts the
    # most important word at the top of the chart.
    plt.figure(figsize=(10, 8))
    plt.barh(top_words, top_importances)
    plt.xlabel('중요도')
    plt.title('텍스트 분류 중요 단어')
    plt.tight_layout()
    plt.show()

카테고리별 핵심 단어

def get_category_keywords(tfidf, X_train, y_train, feature_names, top_n=10,
                          label_encoder=None):
    """Return the highest mean-TF-IDF terms for every category.

    Args:
        tfidf: Fitted vectorizer. Unused; kept so existing calls keep working.
        X_train: TF-IDF feature matrix, one row per document.
        y_train: Encoded (integer) label of each row of ``X_train``.
        feature_names: Array mapping a column index to its term.
        top_n: Number of terms to keep per category; values <= 0 yield
            empty lists.
        label_encoder: Object whose ``classes_`` maps an encoded label back
            to its category name. Defaults to the module-level ``le`` so
            callers written against the old signature still work.

    Returns:
        Dict mapping category name to a list of ``(term, mean_weight)``
        tuples sorted from highest to lowest weight.
    """
    # Prefer the explicitly supplied encoder over the hidden global.
    encoder = le if label_encoder is None else label_encoder
    encoded = np.asarray(y_train)
    n_keep = max(top_n, 0)  # a plain [-top_n:] slice would return everything for 0
    keywords = {}

    for category in np.unique(encoded):
        # Mean weight of every term over this category's documents. ravel()
        # flattens the 1 x n result that matrix-like inputs return.
        mean_weights = np.asarray(X_train[encoded == category].mean(axis=0)).ravel()

        # Descending order, truncated to the requested number of terms.
        best = np.argsort(mean_weights)[::-1][:n_keep]
        keywords[encoder.classes_[category]] = [
            (feature_names[i], mean_weights[i]) for i in best
        ]

    return keywords

# Collect the ten strongest terms of each category from the training data.
category_keywords_result = get_category_keywords(
    tfidf, X_train, y_train_encoded, feature_names, top_n=10
)

# Show only the five strongest terms per category.
print("\n=== 카테고리별 핵심 단어 ===")
for category, words in category_keywords_result.items():
    print(f"{category}: {', '.join(word for word, _ in words[:5])}")

새 텍스트 예측

def predict_category(text, tfidf, model, label_encoder):
    """Classify a single text and report how confident the model is.

    Returns a tuple ``(category, confidence, distribution)`` where
    ``category`` is the decoded class name, ``confidence`` is the
    probability assigned to the predicted class, and ``distribution`` maps
    every class name to its probability.
    """
    # Vectorise the text as a one-document batch.
    vector = tfidf.transform([text]).toarray()

    # The predicted label doubles as the index into the class list and the
    # probability row.
    predicted_idx = model.predict(vector)[0]
    class_probs = model.predict_proba(vector)[0]

    classes = label_encoder.classes_
    distribution = dict(zip(classes, class_probs))
    return classes[predicted_idx], class_probs[predicted_idx], distribution

# One hand-written sentence per category to sanity-check the classifier.
test_sentences = [
    "The team scored a winning goal in the championship match",
    "New AI software improves data security in cloud computing",
    "President signs new bill after congress vote",
    "Stock market shows growth as company profits increase"
]

print("\n=== 새 텍스트 예측 ===")
for sentence in test_sentences:
    # The full probability distribution is not displayed, so discard it.
    category, confidence, _ = predict_category(sentence, tfidf, automl, le)
    # Add the ellipsis only when the sentence really was cut at 50 characters.
    preview = sentence if len(sentence) <= 50 else f"{sentence[:50]}..."
    print(f"\n텍스트: {preview}")
    print(f"예측: {category} (신뢰도: {confidence:.2%})")

다양한 벡터화 비교

from sklearn.feature_extraction.text import CountVectorizer

# Candidate text representations to compare under the same AutoML budget.
vectorizers = {
    'TF-IDF (1-gram)': TfidfVectorizer(max_features=500, ngram_range=(1, 1)),
    'TF-IDF (1,2-gram)': TfidfVectorizer(max_features=500, ngram_range=(1, 2)),
    'Count (1-gram)': CountVectorizer(max_features=500, ngram_range=(1, 1)),
}

results = []

for name, vec in vectorizers.items():
    # Fit each vectorizer on the training texts only.
    X_tr = vec.fit_transform(X_train_text).toarray()
    X_te = vec.transform(X_test_text).toarray()

    model = AutoML()
    model.fit(X_tr, y_train_encoded, task="classification",
              time_budget=30, verbose=0)

    # Use a dedicated name here: assigning to `y_pred` would overwrite the
    # main model's test predictions that the evaluation and confusion-matrix
    # code read.
    vec_pred = model.predict(X_te)
    acc = accuracy_score(y_test_encoded, vec_pred)

    results.append({'벡터화': name, '정확도': acc, '모델': model.best_estimator})
    print(f"{name}: {acc:.4f}")

print("\n=== 벡터화 방법 비교 ===")
print(pd.DataFrame(results).to_string(index=False))

파이프라인 구축

from sklearn.pipeline import Pipeline

# 086 전체 파이프라인
class TextClassificationPipeline:
    """Text classifier that chains TF-IDF features, label encoding and FLAML.

    ``fit`` learns all three components from raw texts and string labels;
    ``predict`` and ``predict_proba`` then accept raw texts directly.
    """

    def __init__(self, max_features=1000, time_budget=60):
        """Store the settings; nothing is trained until ``fit`` is called.

        Args:
            max_features: Vocabulary size cap passed to the TF-IDF vectorizer.
            time_budget: Search time passed to ``AutoML.fit`` as ``time_budget``.
        """
        self.max_features = max_features
        self.time_budget = time_budget
        # These stay None until fit() has run.
        self.tfidf = None
        self.model = None
        self.label_encoder = None

    def fit(self, texts, labels):
        """Train the vectorizer, label encoder and AutoML model; return self."""
        # TF-IDF vectorisation with unigrams + bigrams, English stop words removed.
        self.tfidf = TfidfVectorizer(
            max_features=self.max_features,
            ngram_range=(1, 2),
            stop_words='english'
        )
        X = self.tfidf.fit_transform(texts).toarray()

        # Encode the labels as integers for the model.
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(labels)

        # FLAML model search.
        self.model = AutoML()
        self.model.fit(X, y, task="classification",
                      time_budget=self.time_budget, verbose=0)

        return self

    def _vectorize(self, texts):
        """Convert raw texts to dense TF-IDF features, requiring a fitted pipeline."""
        if self.tfidf is None or self.model is None or self.label_encoder is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError(
                "TextClassificationPipeline is not fitted yet; call fit() first."
            )
        return self.tfidf.transform(texts).toarray()

    def predict(self, texts):
        """Return the predicted labels for *texts*, decoded to the original values.

        Raises:
            RuntimeError: If the pipeline has not been fitted.
        """
        X = self._vectorize(texts)
        y_pred = self.model.predict(X)
        return self.label_encoder.inverse_transform(y_pred)

    def predict_proba(self, texts):
        """Return the class probabilities the underlying model gives for *texts*.

        Raises:
            RuntimeError: If the pipeline has not been fitted.
        """
        X = self._vectorize(texts)
        return self.model.predict_proba(X)

# Train the wrapper on the raw training texts and their string labels.
pipeline = TextClassificationPipeline(max_features=500, time_budget=30)
pipeline.fit(X_train_text, y_train)

# Spot-check the first five held-out documents, selected by position.
predictions = pipeline.predict(X_test_text.iloc[:5])
print("\n=== 파이프라인 테스트 ===")
for pred, actual in zip(predictions, y_test.iloc[:5]):
    print(f"예측: {pred}, 실제: {actual}")

모델 저장

import joblib

# Persist the three objects needed at inference time (vectorizer, model,
# label encoder) together in a single file.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(
    dict(tfidf=tfidf, model=automl, label_encoder=le),
    'text_classifier.pkl',
)

print("\n텍스트 분류 모델 저장: text_classifier.pkl")

정리

TF-IDF: 텍스트를 수치 벡터로 변환
n-gram: 유니그램 + 바이그램으로 문맥 포착
FLAML: 자동 모델 선택 및 하이퍼파라미터 튜닝
특성 중요도: 분류에 중요한 단어 확인
파이프라인: 전처리부터 예측까지 자동화

다음 글 예고

다음 글에서는 A/B 테스트 분석 자동화 프로젝트를 진행합니다. FLAML을 활용한 실험 분석 자동화 방법을 알아봅니다.

FLAML AutoML 마스터 시리즈 #086

개요

실습 환경

TF-IDF 개념

데이터 생성 (뉴스 분류)

TF-IDF 벡터화

FLAML로 텍스트 분류

모델 평가

혼동 행렬 시각화

특성 중요도 분석

카테고리별 핵심 단어

새 텍스트 예측

다양한 벡터화 비교

파이프라인 구축

모델 저장

정리

다음 글 예고

개요