from matplotlib.image import imread
image = imread('./data/flower.jpeg')
print(image.shape) # (331, 500, 3)
import matplotlib.pyplot as plt
plt.imshow(image);
segmented_img.astype(np.uint8)
이미지는 RGB로 dtype이 'uint8'여야 이미지가 보인다 → dtype 다르게 그냥 넣으면 오류 발생import numpy as np
from sklearn.cluster import KMeans
X = image.reshape(-1, 3) #RGB
kmeans = KMeans(n_clusters=8, random_state=13).fit(X)
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)
segmented_img = segmented_img.astype(np.uint8)
plt.imshow(segmented_img)
segmented_imgs = []
n_colors = (10, 8, 6, 4, 2)
for n_clusters in n_colors:
kmeans = KMeans(n_clusters=8, random_state=13).fit(X)
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)
segmented_img = segmented_img.astype(np.uint8)
segmented_imgs.append(segmented_img)
plt.figure(figsize=(10, 5))
plt.subplots_adjust(wspace=0.05, hspace=0.1)
plt.subplot(231)
plt.imshow(image)
plt.title('original image')
plt.axis('off')
for idx, n_clusters in enumerate(n_colors):
plt.subplot(232 + idx)
plt.imshow(segmented_imgs[idx])
plt.title('{} colors'.format(n_clusters))
plt.axis('off')
plt.show()
from sklearn.datasets import load_digits #MINIST
from sklearn.model_selection import train_test_split
X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=13)
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=5000, random_state=13)
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test) # 0.9622222222222222
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
('kmeans', KMeans(n_clusters=50, random_state=13)),
('log_reg', LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=5000, random_state=13))
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test) # 0.9688888888888889
from sklearn.model_selection import GridSearchCV
param_grid = dict(kmeans__n_clusters=range(2,100))
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)
print(grid_clf.best_params_) # {'kmeans__n_clusters': 82}
print(grid_clf.score(X_test, y_test)) # 0.9755555555555555
Reference
1) 제로베이스 데이터스쿨 강의자료