When we transform the data, which vectors should we choose so that the structure of the original data is preserved as well as possible?
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
rng = np.random.RandomState(13)
X = np.dot(rng.rand(2,2), rng.randn(2,200)).T
X.shape
plt.scatter(X[:,0], X[:,1])
plt.axis('equal')
from sklearn.decomposition import PCA
pca = PCA(n_components=2, random_state=13)
pca.fit(X)
pca.components_
pca.explained_variance_
pca.explained_variance_ratio_
pca.mean_
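This answers the opening question directly: the principal axes are the eigenvectors of the data's covariance matrix, ordered by eigenvalue. A minimal verification sketch (not part of the original notebook; the variable names are just for illustration):

# verify that pca.components_ are the covariance eigenvectors (up to sign)
cov = np.cov(X.T)                        # 2x2 sample covariance of X
eigvals, eigvecs = np.linalg.eigh(cov)   # eigh returns ascending eigenvalues
order = np.argsort(eigvals)[::-1]        # reorder descending, as PCA does
print(eigvals[order])                    # ≈ pca.explained_variance_
print(eigvecs[:, order].T)               # ≈ pca.components_ (rows, up to sign)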
def draw_vector(v0, v1, ax=None):
    # draw an arrow from v0 to v1 on the given (or current) axes
    ax = ax or plt.gca()
    arrowprops = dict(arrowstyle="->",
                      linewidth=2, color="black",
                      shrinkA=0, shrinkB=0)
    ax.annotate('', v1, v0, arrowprops=arrowprops)
plt.scatter(X[:,0], X[:,1], alpha=0.4)
for length, vector in zip(pca.explained_variance_, pca.components_):
    v = vector * 3 * np.sqrt(length)  # scale each axis to 3 standard deviations
    draw_vector(pca.mean_, pca.mean_ + v)
plt.axis('equal')
plt.show()
pca = PCA(n_components=1, random_state=13)
pca.fit(X)
pca.components_
pca.mean_
pca.explained_variance_ratio_
X_pca = pca.transform(X)
X_pca
X_new = pca.inverse_transform(X_pca)
plt.scatter(X[:,0], X[:,1], alpha=0.3)
plt.scatter(X_new[:,0], X_new[:,1], alpha=0.9)
plt.axis('equal')
plt.show()
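How much information does the 1-component projection throw away? A short sketch (not in the original) that measures it two ways:

# mean squared reconstruction error of the 1-D projection
print(np.mean(np.sum((X - X_new) ** 2, axis=1)))
# and the fraction of variance not captured by PC1
print(1 - np.sum(pca.explained_variance_ratio_))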
The result can look like a linear regression fit, but it is not the same thing: PCA picks the axis that minimizes the perpendicular (orthogonal) distances to the points, whereas regression minimizes the vertical errors.
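A quick way to see the difference (a sketch, assuming the X and the 1-component pca fitted above; slope_ols and slope_pca are illustrative names):

# slope of an ordinary least-squares fit (minimizes vertical errors)
slope_ols = np.polyfit(X[:, 0], X[:, 1], 1)[0]
# slope of the first principal axis (minimizes orthogonal distances)
slope_pca = pca.components_[0, 1] / pca.components_[0, 0]
print(slope_ols, slope_pca)  # similar for this data, but not identical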
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
iris_pd = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_pd['species'] = iris.target
iris_pd.head()
import seaborn as sns
sns.pairplot(iris_pd, hue='species', height=3,
             x_vars=['sepal length (cm)', 'petal width (cm)'],
             y_vars=['petal length (cm)', 'sepal width (cm)']);
from sklearn.preprocessing import StandardScaler
iris_ss = StandardScaler().fit_transform(iris.data)
iris_ss[:3]
from sklearn.decomposition import PCA
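Standardizing first matters because PCA is scale-sensitive: a feature measured on a larger scale would dominate the components. A minimal comparison sketch (not in the original):

# compare explained variance on raw vs standardized features
print(PCA(n_components=2).fit(iris.data).explained_variance_ratio_)
print(PCA(n_components=2).fit(iris_ss).explained_variance_ratio_)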
def get_pca_data(ss_data, n_components=2):
    # fit PCA on the scaled data; return the transformed data and the fitted object
    pca = PCA(n_components=n_components)
    pca.fit(ss_data)
    return pca.transform(ss_data), pca
iris_pca, pca = get_pca_data(iris_ss, 2)
iris_pca.shape
pca.mean_
pca.components_
pca.explained_variance_ratio_
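Two components already capture most of the variance here. To see how the cumulative ratio grows component by component (a sketch, not in the original; pca_full is an illustrative name):

# cumulative explained variance across all four components
pca_full = PCA().fit(iris_ss)
print(np.cumsum(pca_full.explained_variance_ratio_))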
def get_pd_from_pca(pca_data, cols=['PC1', 'PC2']):
    return pd.DataFrame(pca_data, columns=cols)
iris_pd_pca = get_pd_from_pca(iris_pca)
iris_pd_pca['species'] = iris.target
iris_pd_pca.head()
sns.pairplot(iris_pd_pca, hue='species', height=5, x_vars=['PC1'], y_vars=['PC2']);
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
def rf_scores(X, y, cv=5):
    # cross-validated accuracy of a 100-tree random forest
    rf = RandomForestClassifier(random_state=13, n_estimators=100)
    scores_rf = cross_val_score(rf, X, y, scoring='accuracy', cv=cv)
    print('Score :', np.mean(scores_rf))
%%time
rf_scores(iris_ss, iris.target)
pca_X = iris_pd_pca[['PC1', 'PC2']]
rf_scores(pca_X, iris.target)  # not very meaningful here
import pandas as pd
wine_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv'
wine = pd.read_csv(wine_url, index_col=0)
wine.head()
wine_X = wine.drop(['color'], axis=1)
wine_y = wine['color']
wine_ss = StandardScaler().fit_transform(wine_X)
def print_variance_ratio(pca):
    print('variance_ratio :', pca.explained_variance_ratio_)
    print('sum of variance_ratio :', np.sum(pca.explained_variance_ratio_))
pca_wine, pca = get_pca_data(wine_ss, n_components=2)
print_variance_ratio(pca)
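With only two components the captured variance is limited. scikit-learn's PCA also accepts a float n_components, in which case it picks the smallest number of components reaching that variance fraction. A sketch (not in the original; pca_90 is an illustrative name):

# let PCA choose the component count for ~90% of the variance
pca_90 = PCA(n_components=0.9).fit(wine_ss)
print(pca_90.n_components_)
print(np.sum(pca_90.explained_variance_ratio_))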
pca_columns = ['PC1', 'PC2']
pca_wine_pd = pd.DataFrame(pca_wine, columns=pca_columns)
pca_wine_pd['color'] = wine_y.values
sns.pairplot(pca_wine_pd, hue='color', height=5, x_vars=['PC1'], y_vars=['PC2']);
%%time
rf_scores(wine_ss, wine_y)
%%time
pca_X = pca_wine_pd[['PC1','PC2']]
rf_scores(pca_X, wine_y)
pca_wine, pca = get_pca_data(wine_ss, n_components=3)
print_variance_ratio(pca)
pca_columns = ['PC1', 'PC2', 'PC3']
pca_wine_pd = pd.DataFrame(pca_wine, columns=pca_columns)
pca_wine_pd['color'] = wine_y.values
pca_X = pca_wine_pd[pca_columns]
rf_scores(pca_X, wine_y)
pca_wine_plot = pca_X.copy()  # copy so the plot frame does not mutate pca_X
pca_wine_plot['color'] = wine_y.values
pca_wine_plot.head()
import plotly.express as px
fig = px.scatter_3d(pca_wine_plot, x='PC1', y='PC2', z='PC3', color='color', symbol='color', opacity=0.4)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()