# Fit PCA on the flattened fruit images, keeping the 50 strongest components.
from sklearn.decomposition import PCA

pca = PCA(n_components=50).fit(fruits_2d)  # fit() returns the estimator itself
print(pca.components_.shape)  # one row per component, one column per pixel
# output: (50, 10000)
# Visualize each principal component as a 100x100 "eigen-image",
# then project the data down to 50 dimensions.
eigen_images = pca.components_.reshape(-1, 100, 100)
draw_fruits(eigen_images)
fruits_pca = pca.transform(fruits_2d)
print(fruits_pca.shape)  # (n_samples, 50)
# output: (300, 50)
# Map the 50-D representation back to pixel space to see how much
# image detail survives the round trip.
fruits_inverse = pca.inverse_transform(fruits_pca)
print(fruits_inverse.shape)
fruits_reconstruct = fruits_inverse.reshape(-1, 100, 100)
draw_fruits(fruits_reconstruct)
# Total fraction of variance retained by the 50 components,
# plus the per-component curve (look for the elbow).
print(pca.explained_variance_ratio_.sum())
plt.plot(pca.explained_variance_ratio_)
# output: 0.9215462358239324
# -> beyond roughly the first 10, the principal components contribute little
# Compare logistic-regression CV accuracy and fit time on the raw pixels
# versus the 50-D PCA projection.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

lr = LogisticRegression()
target = np.repeat(np.arange(3), 100)  # 100 samples per fruit class
scores = cross_validate(lr, fruits_2d, target)
print(np.mean(scores['test_score']), np.mean(scores['fit_time']))
scores = cross_validate(lr, fruits_pca, target)
print(np.mean(scores['test_score']), np.mean(scores['fit_time']))
# output: 0.9966666666666667 1.1434682369232179  # before dimensionality reduction
# output: 1.0 0.03203639984130859                # after dimensionality reduction
# A float n_components asks PCA for however many components are needed
# to explain that fraction (here 50%) of the variance.
pca = PCA(n_components=0.5).fit(fruits_2d)
print(pca.n_components_)  # number of components actually chosen
fruits_pca = pca.transform(fruits_2d)
print(fruits_pca.shape)
# output: 2  -> just 2 components already cover 50% of the variance
# output: (300, 2)
# Cross-validate the same classifier on the 2-D projection.
scores = cross_validate(lr, fruits_pca, target)
mean_accuracy = np.mean(scores['test_score'])
mean_fit_time = np.mean(scores['fit_time'])
print(mean_accuracy, mean_fit_time)
# output: 0.99 0.04648008346557617
# -> even with only 2 features a good classifier can be trained
# Cluster the PCA-projected fruits into 3 groups and count cluster sizes.
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, random_state=42).fit(fruits_pca)
print(np.unique(km.labels_, return_counts=True))
# output: (array([0, 1, 2], dtype=int32), array([110, 99, 91]))
# Scatter-plot the 2-D PCA projection, one color per k-means cluster.
# NOTE(review): the pasted source had lost the loop-body indentation
# (IndentationError as written); restored here.
for label in range(3):
    cluster_points = fruits_pca[km.labels_ == label]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1])
# Legend entries follow cluster labels 0..2 — assumes cluster 0 is pineapple,
# 1 is apple, 2 is banana; verify against the cluster counts printed above.
plt.legend(['pineapple', 'apple', 'banana'])
plt.show()