# 동작 원리
# 1. K개의 중심점(centroid)을 부여
# 2. 각 데이터에서 가장 가까운 중심점을 기준으로 레이블링
# 3. 각 그룹에 속한 데이터들의 평균 위치를 찾아 새로운 중심점으로 지정
# 4. 2~3을 반복하며 중심점 변화가 거의 없는 지점에서 종료
# 장점
# 단점
# 적절한 K 선택방법
from sklearn.preprocessing import scale
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the iris dataset and inspect its feature names.
iris = load_iris()
iris.feature_names

# Strip the trailing " (cm)" unit suffix (5 characters) from each feature name.
cols = [name[:-5] for name in iris.feature_names]
cols
# Result: ['sepal length', 'sepal width', 'petal length', 'petal width']
iris_df = pd.DataFrame(data=iris.data, columns=cols)
iris_df.head()

# Use only two features so the clusters can be visualized in 2-D.
feature = iris_df[['petal length', 'petal width']]
feature.head()

# Cluster into 3 groups. Fix random_state so repeated runs give the same
# labels, and set n_init explicitly to avoid the sklearn FutureWarning
# about its changing default.
model = KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(feature)
model.labels_

# 해당 값들은 clustering 결과에 의한 라벨링으로 실제 라벨과 다르다.
# (The labels above come from the clustering result and differ from the true labels.)
# Centroids found by the fitted model.
model.cluster_centers_

# Attach the predicted cluster id to each sample as a 'cluster' column.
predict = pd.DataFrame(model.predict(feature), columns=['cluster'])
feature = pd.concat([feature, predict], axis=1)
feature.head(10)

# Scatter the samples colored by cluster, then overlay the centroids
# as red diamonds.
centers = pd.DataFrame(model.cluster_centers_,
                       columns=['petal length', 'petal width'])
center_x = centers['petal length']
center_y = centers['petal width']

plt.figure(figsize=(10, 6))
plt.scatter(feature['petal length'], feature['petal width'],
            c=feature['cluster'], alpha=0.5)
plt.scatter(center_x, center_y, s=50, marker='D', c='r')
plt.show()

# Re-run clustering on all four features using k-means++ initialization.
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

iris = load_iris()
# BUG FIX: the original list contained ' petal length' with a stray
# leading space, producing a column name inconsistent with the others.
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
iris_df = pd.DataFrame(data=iris.data, columns=feature_names)
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300,
                random_state=23).fit(iris_df)
iris_df['cluster'] = kmeans.labels_
iris_df.head(100)

# Evaluate clustering quality with silhouette analysis:
# per-sample coefficients plus the overall average score.
from sklearn.metrics import silhouette_samples, silhouette_score

score_values = silhouette_samples(iris.data, iris_df['cluster'])
avg_value = silhouette_score(iris.data, iris_df['cluster'])
print('avg_value', avg_value)
print('silhouette_samples() return 값의 shape', score_values.shape)

def visualize_silhouette(cluster_lists, X_features):
    """Draw one silhouette plot per candidate cluster count.

    For each k in *cluster_lists*, fit a KMeans model on *X_features*,
    then plot the sorted per-sample silhouette coefficients of every
    cluster as a filled horizontal band. The average silhouette score is
    marked with a dashed red vertical line.

    Parameters
    ----------
    cluster_lists : list[int]
        Candidate numbers of clusters; one subplot is drawn per entry.
    X_features : array-like of shape (n_samples, n_features)
        Data to cluster.
    """
    # NOTE: the original body had lost all indentation (flattened paste);
    # structure restored here. Unused imports (make_blobs, math) removed.
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm

    # One subplot per candidate cluster count, laid out horizontally.
    n_cols = len(cluster_lists)
    fig, axs = plt.subplots(figsize=(4 * n_cols, 4), nrows=1, ncols=n_cols)

    for ind, n_cluster in enumerate(cluster_lists):
        clusterer = KMeans(n_clusters=n_cluster, max_iter=500, random_state=0)
        cluster_labels = clusterer.fit_predict(X_features)
        sil_avg = silhouette_score(X_features, cluster_labels)
        sil_values = silhouette_samples(X_features, cluster_labels)

        y_lower = 10  # gap below the first silhouette band
        axs[ind].set_title('Number of Cluster : ' + str(n_cluster) + '\n'
                           'Silhouette Score :' + str(round(sil_avg, 3)))
        axs[ind].set_xlabel("The silhouette coefficient values")
        axs[ind].set_ylabel("Cluster label")
        axs[ind].set_xlim([-0.1, 1])
        # Leave room for an inter-band gap of 10 per cluster.
        axs[ind].set_ylim([0, len(X_features) + (n_cluster + 1) * 10])
        axs[ind].set_yticks([])  # Clear the yaxis labels / ticks
        axs[ind].set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        # Draw one sorted silhouette band per cluster.
        for i in range(n_cluster):
            ith_cluster_sil_values = sil_values[cluster_labels == i]
            ith_cluster_sil_values.sort()
            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cm.nipy_spectral(float(i) / n_cluster)
            axs[ind].fill_betweenx(np.arange(y_lower, y_upper), 0,
                                   ith_cluster_sil_values,
                                   facecolor=color, edgecolor=color,
                                   alpha=0.7)
            # Label the band with its cluster index at mid-height.
            axs[ind].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # gap before the next band

        # Mark the average silhouette score across all samples.
        axs[ind].axvline(x=sil_avg, color="red", linestyle="--")


visualize_silhouette(cluster_lists=[2, 3, 4], X_features=iris.data)
