본문으로 건너뛰기

062 클러스터링 평가와 시각화

키워드: 평가, 시각화

개요

클러스터링은 정답이 없는 비지도 학습이므로 평가가 까다롭습니다. 이 글에서는 클러스터링 결과를 평가하고 시각화하는 다양한 방법을 알아봅니다.

실습 환경

  • Python 버전: 3.11 권장
  • 필요 패키지: pycaret[full]>=3.0

내부 평가 지표 (정답 없음)

실루엣 점수 (Silhouette Score)

from pycaret.clustering import *
from pycaret.datasets import get_data
from sklearn.metrics import silhouette_score, silhouette_samples
import numpy as np

# Load the sample dataset and configure the clustering experiment
# (features are normalized; session_id fixes the random seed).
data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

# Fit K-Means with 4 clusters and attach a 'Cluster' label to each row.
kmeans = create_model('kmeans', num_clusters=4)
clustered = assign_model(kmeans)

# Preprocessed feature matrix used for scoring.
X = get_config('X')
labels = clustered['Cluster'].values

# Overall silhouette score: -1..1, higher is better.
overall_score = silhouette_score(X, labels)
print(f"전체 실루엣 점수: {overall_score:.4f}")

# Per-sample silhouette values, averaged per cluster.
sample_scores = silhouette_samples(X, labels)

# FIX: the loop body had lost its indentation in the original source.
for cluster in sorted(set(labels)):
    cluster_scores = sample_scores[labels == cluster]
    print(f"클러스터 {cluster}: {cluster_scores.mean():.4f} (n={len(cluster_scores)})")

칼린스키-하라바스 지수

from sklearn.metrics import calinski_harabasz_score

# Calinski-Harabasz index: ratio of between-cluster to within-cluster
# dispersion. Higher is better (stronger cluster separation).
ch_score = calinski_harabasz_score(X=X, labels=labels)
print(f"칼린스키-하라바스 지수: {ch_score:.2f}")

데이비스-볼딘 지수

from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index: average similarity of each cluster to its most
# similar neighbour. Lower is better (tighter, better-separated clusters).
db_score = davies_bouldin_score(X=X, labels=labels)
print(f"데이비스-볼딘 지수: {db_score:.4f}")

외부 평가 지표 (정답 있음)

정답 레이블이 있는 경우:

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
import numpy as np

# External metrics need ground-truth labels; fabricate some here purely
# for demonstration (seed fixed so the example is reproducible).
np.random.seed(42)
true_labels = np.random.randint(0, 4, len(labels))

# Adjusted Rand Index: -1..1, higher is better.
ari = adjusted_rand_score(true_labels, labels)
print(f"ARI: {ari:.4f}")

# Normalized Mutual Information: 0..1, higher is better.
nmi = normalized_mutual_info_score(true_labels, labels)
print(f"NMI: {nmi:.4f}")

# Homogeneity, completeness, and their harmonic mean (V-measure).
hom = homogeneity_score(true_labels, labels)
com = completeness_score(true_labels, labels)
vms = v_measure_score(true_labels, labels)
print(f"Homogeneity: {hom:.4f}")
print(f"Completeness: {com:.4f}")
print(f"V-measure: {vms:.4f}")

PyCaret 시각화

from pycaret.clustering import *
from pycaret.datasets import get_data

# Same experiment setup as the earlier snippets.
data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

kmeans = create_model('kmeans', num_clusters=4)

# Render PyCaret's built-in diagnostics in order:
# 1. cluster (PCA 2D)  2. silhouette  3. elbow  4. distribution  5. distance
for plot_kind in ('cluster', 'silhouette', 'elbow', 'distribution', 'distance'):
    plot_model(kmeans, plot=plot_kind)

2D 시각화 (PCA)

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from pycaret.clustering import *
from pycaret.datasets import get_data

data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

kmeans = create_model('kmeans', num_clusters=4)
clustered = assign_model(kmeans)

X = get_config('X')

# Project the feature matrix down to two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# FIX: PyCaret's 'Cluster' column holds string labels ('Cluster 0', ...);
# matplotlib's `c=` requires numeric values, so map them to category codes.
cluster_codes = clustered['Cluster'].astype('category').cat.codes

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                      c=cluster_codes,
                      cmap='viridis', alpha=0.7, s=50)

# Overlay the K-Means centroids, projected through the same PCA.
# NOTE(review): assumes get_config('X') is the same (normalized) space the
# model was fitted on — confirm against the PyCaret version in use.
centers_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1],
            c='red', marker='X', s=300, edgecolors='black', linewidth=2)

plt.colorbar(scatter, label='Cluster')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)')
plt.title('Clustering Results (PCA 2D)')
plt.savefig('clustering_pca.png', dpi=150)

print(f"설명된 분산: {sum(pca.explained_variance_ratio_)*100:.1f}%")

t-SNE 시각화

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from pycaret.clustering import *
from pycaret.datasets import get_data

data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

kmeans = create_model('kmeans', num_clusters=4)
clustered = assign_model(kmeans)

X = get_config('X')

# Non-linear 2D embedding; distances are only locally meaningful.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

# FIX: PyCaret's 'Cluster' column holds string labels ('Cluster 0', ...);
# matplotlib's `c=` requires numeric values, so map them to category codes.
cluster_codes = clustered['Cluster'].astype('category').cat.codes

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1],
                      c=cluster_codes,
                      cmap='viridis', alpha=0.7, s=50)

plt.colorbar(scatter, label='Cluster')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('Clustering Results (t-SNE 2D)')
plt.savefig('clustering_tsne.png', dpi=150)

특성별 클러스터 분포

import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.clustering import *
from pycaret.datasets import get_data

data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

kmeans = create_model('kmeans', num_clusters=4)
clustered = assign_model(kmeans)

# Box plots of the original (un-normalized) features, grouped by cluster.
features = data.columns.tolist()

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# First four features only — one panel per feature in a 2x2 grid.
# FIX: the loop body had lost its indentation in the original source;
# also dropped the unused `n_features` local.
for idx, feature in enumerate(features[:4]):
    ax = axes[idx // 2, idx % 2]
    clustered.boxplot(column=feature, by='Cluster', ax=ax)
    ax.set_title(f'{feature} by Cluster')
    ax.set_xlabel('Cluster')

plt.tight_layout()
plt.savefig('cluster_boxplots.png', dpi=150)

레이더 차트 (클러스터 프로파일)

import numpy as np
import matplotlib.pyplot as plt
from pycaret.clustering import *
from pycaret.datasets import get_data

data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

kmeans = create_model('kmeans', num_clusters=4)
clustered = assign_model(kmeans)

# Per-cluster feature means on the preprocessed (normalized) matrix.
X = get_config('X')
clustered_X = X.copy()
clustered_X['Cluster'] = clustered['Cluster']
cluster_means = clustered_X.groupby('Cluster').mean()

# One spoke per feature; the polygon is closed by repeating the first angle.
features = X.columns.tolist()
num_vars = len(features)

angles = [n / float(num_vars) * 2 * np.pi for n in range(num_vars)]
angles += angles[:1]

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

# NOTE(review): this color list assumes exactly 4 clusters (num_clusters=4
# above); more clusters would raise IndexError here.
colors = ['blue', 'green', 'red', 'purple']

# FIX: the loop body had lost its indentation in the original source.
for idx, cluster in enumerate(cluster_means.index):
    values = cluster_means.loc[cluster].values.tolist()
    values += values[:1]  # repeat the first value to close the polygon

    ax.plot(angles, values, 'o-', linewidth=2, label=f'Cluster {cluster}', color=colors[idx])
    ax.fill(angles, values, alpha=0.1, color=colors[idx])

ax.set_xticks(angles[:-1])
ax.set_xticklabels(features)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
plt.title('Cluster Profiles (Radar Chart)', y=1.08)
plt.savefig('cluster_radar.png', dpi=150, bbox_inches='tight')

히트맵

import seaborn as sns
import matplotlib.pyplot as plt
from pycaret.clustering import *
from pycaret.datasets import get_data

# Same experiment setup as the earlier snippets.
data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

kmeans = create_model('kmeans', num_clusters=4)
clustered = assign_model(kmeans)

# Build a per-cluster profile: mean of each preprocessed feature.
X = get_config('X')
profile = X.copy()
profile['Cluster'] = clustered['Cluster']
mean_profile = profile.groupby('Cluster').mean()

# Annotated heatmap of the cluster-mean profile.
plt.figure(figsize=(10, 6))
sns.heatmap(mean_profile, annot=True, fmt='.2f', cmap='RdYlBu_r',
            center=0, linewidths=0.5)
plt.title('Cluster Means Heatmap')
plt.xlabel('Features')
plt.ylabel('Cluster')
plt.savefig('cluster_heatmap.png', dpi=150)

클러스터 품질 보고서

from pycaret.clustering import *
from pycaret.datasets import get_data
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import pandas as pd

data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

kmeans = create_model('kmeans', num_clusters=4)
clustered = assign_model(kmeans)

X = get_config('X')
labels = clustered['Cluster'].values

# Consolidated quality report: cluster sizes, internal metrics, centroids.
print("=" * 50)
print(" 클러스터링 품질 보고서")
print("=" * 50)

print(f"\n모델: K-Means (K=4)")
print(f"데이터 크기: {len(X)} 샘플, {X.shape[1]} 특성")

print("\n--- 클러스터 분포 ---")
cluster_counts = clustered['Cluster'].value_counts().sort_index()
# FIX: the loop body had lost its indentation in the original source.
for cluster, count in cluster_counts.items():
    print(f" 클러스터 {cluster}: {count} ({count/len(X)*100:.1f}%)")

print("\n--- 품질 지표 ---")
print(f" 실루엣 점수: {silhouette_score(X, labels):.4f} (높을수록 좋음, -1~1)")
print(f" 칼린스키-하라바스: {calinski_harabasz_score(X, labels):.2f} (높을수록 좋음)")
print(f" 데이비스-볼딘: {davies_bouldin_score(X, labels):.4f} (낮을수록 좋음)")

print("\n--- 클러스터 중심 ---")
# NOTE(review): centers are in the model's (normalized) feature space.
centers = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centers.index = [f'Cluster {i}' for i in range(len(centers))]
print(centers.round(3))

print("\n" + "=" * 50)

여러 알고리즘 비교

from pycaret.clustering import *
from pycaret.datasets import get_data
from sklearn.metrics import silhouette_score
import pandas as pd

data = get_data('jewellery')
clust = setup(data, normalize=True, session_id=42, verbose=False)

X = get_config('X')

# Fit each algorithm with the same K and score it on the same matrix,
# so silhouette values are directly comparable.
algorithms = ['kmeans', 'hclust', 'birch']
results = []

# FIX: the loop body had lost its indentation in the original source.
for algo in algorithms:
    model = create_model(algo, num_clusters=4)
    clustered = assign_model(model)
    labels = clustered['Cluster'].values

    score = silhouette_score(X, labels)
    n_clusters = len(set(labels))  # actual cluster count produced

    results.append({
        'Algorithm': algo,
        'Clusters': n_clusters,
        'Silhouette': score
    })

# Rank algorithms by silhouette, best first.
df = pd.DataFrame(results).sort_values('Silhouette', ascending=False)
print("알고리즘 비교:")
print(df.to_string(index=False))

정리

  • 내부 평가: 실루엣, 칼린스키-하라바스, 데이비스-볼딘
  • 외부 평가: ARI, NMI (정답 레이블 필요)
  • 2D 시각화: PCA, t-SNE
  • 프로파일: 박스플롯, 레이더 차트, 히트맵
  • 여러 지표를 종합하여 판단

다음 글 예고

다음 글에서는 클러스터링 실전 - 고객 세분화를 다룹니다.


PyCaret 머신러닝 마스터 시리즈 #062