Principal Component Analysis(PCA)
: 데이터 집합 내에 존재하는 각 데이터의 차이를 가장 잘 나타내 주는 요소를 찾아내는 방법. 변수 추출은 기존 변수를 조합해 새로운 변수를 만드는 기법으로, 변수 선택과 구분해야 한다.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Plot styling for every figure drawn below.
sns.set_style('whitegrid')

# Seeded generator so the synthetic sample is reproducible.
rng = np.random.RandomState(13)

# Product of a random (2, 2) matrix and a (2, 200) standard-normal matrix,
# transposed so that each row of X is one 2-D sample.
X = rng.rand(2, 2).dot(rng.randn(2, 200)).T
# Bare expression: no effect when run as a script (presumably kept for
# interactive display of the array shape).
X.shape

# X.T unpacks into the two coordinate columns.
plt.scatter(*X.T)
plt.axis('equal')
plt.show()
from sklearn.decomposition import PCA
# n_components: number of principal components to keep.
# fit() returns the estimator itself, so construction and fitting are chained.
pca = PCA(n_components=2, random_state=13).fit(X)

# The three bare expressions below have no effect when run as a script
# (presumably kept for interactive inspection of the fitted attributes).
# Component vectors.
pca.components_
# Variance along each component (explanatory power).
pca.explained_variance_
# Mean of the data (acts as a kind of coordinate origin).
pca.mean_
def draw_vector(v0, v1, ax=None):
    """Draw a black arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0 : start point of the arrow (passed to ``annotate`` as ``xytext``).
    v1 : end point of the arrow (passed to ``annotate`` as ``xy``).
    ax : object exposing ``annotate``; when ``None``, the current pyplot
        axes returned by ``plt.gca()`` is used.
    """
    # Explicit None check: an axes object supplied by the caller must never
    # be replaced by the current axes just because it evaluates as falsy.
    if ax is None:
        ax = plt.gca()

    arrowprops = dict(
        arrowstyle='->',
        linewidth=2,
        color='black',
        # No shrink at either end, so the arrow spans the two points exactly.
        shrinkA=0,
        shrinkB=0,
    )
    # Empty annotation text: only the arrow itself is rendered.
    ax.annotate('', xy=v1, xytext=v0, arrowprops=arrowprops)
# Sample cloud, made translucent so the arrows drawn on top stay visible.
plt.scatter(*X.T, alpha=0.4)

# One arrow per component, starting at the data mean and scaled to
# 3 * sqrt(variance) along the component direction.
for variance, axis in zip(pca.explained_variance_, pca.components_):
    tip = pca.mean_ + axis * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, tip)

plt.axis('equal')
plt.show()
# Refit keeping a single principal component, then project X onto it.
pca = PCA(n_components=1, random_state=13).fit(X)
X_pca = pca.transform(X)

# Print the fitted component, its variance and the data mean.
for fitted_attribute in (pca.components_, pca.explained_variance_, pca.mean_):
    print(fitted_attribute)

# The transformed values can be mapped back into the original space.
X_new = pca.inverse_transform(X_pca)

# Original samples (faint) overlaid with their reconstructions (strong).
for points, opacity in ((X, 0.3), (X_new, 0.9)):
    plt.scatter(points[:, 0], points[:, 1], alpha=opacity)
plt.axis('equal')
plt.show()
# `pd` and `StandardScaler` are used below but are not imported by any
# earlier line of this file, so they are brought into scope here.
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the wine table; the first CSV column becomes the index.
wine = pd.read_csv('../data/02/wine.csv', sep=',', index_col=0)
# Bare expression: no effect as a script (presumably for interactive preview).
wine.head()

# Split into the 'color' label column and the remaining feature columns.
wine_y = wine['color']
wine_X = wine.drop(['color'], axis=1)
wine_X.head()

# Standardize every feature column before running PCA on it.
wine_ss = StandardScaler().fit_transform(wine_X)
# Bare expressions: no effect as a script (presumably for interactive preview).
wine_ss[:3]
wine_ss.shape
def print_variance_ratio(pca):
    """Print the ``explained_variance_ratio_`` of ``pca`` and its sum.

    ``pca`` is any object exposing an ``explained_variance_ratio_``
    attribute that ``np.sum`` accepts. Returns ``None``.
    """
    ratios = pca.explained_variance_ratio_
    total = np.sum(ratios)
    print('variance_ratio : {}'.format(ratios))
    print('sum of variance_ratio : {}'.format(total))
# NOTE(review): get_pca_data is not defined in the visible part of this file;
# from its use here it returns a pair (transformed data, fitted PCA) - confirm.
pca_columns = ['pca_component1', 'pca_component2']
pca_wine, pca = get_pca_data(wine_ss, n_components=2)
# Bare expression: no effect as a script (presumably for interactive display).
pca_wine.shape
print_variance_ratio(pca)

# Tabulate the two components and attach the label column for plotting.
pca_wine_pd = pd.DataFrame(data=pca_wine, columns=pca_columns)
pca_wine_pd['color'] = wine_y.values

# Single-panel pair plot: component 1 on x, component 2 on y, hued by label.
sns.pairplot(
    pca_wine_pd,
    x_vars=['pca_component1'],
    y_vars=['pca_component2'],
    hue='color',
    height=5,
);
# NOTE(review): rf_scores is not defined in the visible part of this file.
# Feature frame restricted to the two principal components.
pca_X = pca_wine_pd[['pca_component1', 'pca_component2']]

# Score the original (standardized) data first, then the two components.
for features in (wine_ss, pca_X):
    rf_scores(features, wine_y)
# Repeat the experiment with three principal components; the component
# count is taken from the number of column names.
pca_cols = ['pca_1', 'pca_2', 'pca_3']
pca_wine, pca = get_pca_data(wine_ss, n_components=len(pca_cols))
print_variance_ratio(pca)

pca_wine_pd = pd.DataFrame(data=pca_wine, columns=pca_cols)

# Three principal components as the feature set.
pca_X = pca_wine_pd[pca_cols]
rf_scores(pca_X, wine_y)
# Collect the results for plotting.
# Work on an explicit copy: a plain assignment would alias pca_X, so adding
# the label column below would also put 'color' into the feature frame
# (and can trigger pandas' SettingWithCopyWarning).
pca_wine_plot = pca_X.copy()
pca_wine_plot['color'] = wine_y.values
# Bare expression: no effect as a script (presumably for interactive preview).
pca_wine_plot.head()
from mpl_toolkits.mplot3d import Axes3D
# One marker per 'color' value; the position in this list is the label value
# that the loop below filters on.
markers = ['^', 'o']

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

for i, marker in enumerate(markers):
    # Select the rows of this class once instead of once per axis.
    class_points = pca_wine_plot[pca_wine_plot['color'] == i]
    ax.scatter(
        class_points['pca_1'],
        class_points['pca_2'],
        class_points['pca_3'],
        s=20,
        alpha=0.5,
        # Pass the marker through so each class gets its own symbol;
        # previously the loop variable was never used.
        marker=marker,
    )

# Camera position: elevation 30, azimuth 80.
ax.view_init(30, 80)
plt.show()
import plotly_express as px
# Interactive 3-D scatter of the three components; the 'color' column drives
# both the colour and the symbol of each point.
fig = px.scatter_3d(
    pca_wine_plot,
    x='pca_1',
    y='pca_2',
    z='pca_3',
    color='color',
    symbol='color',
    opacity=0.4,
)
# Remove all figure margins.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()