필요한 라이브러리 호출
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
데이터 로드
# Generate 200 blob samples around 5 centres (std 1.5); labels are discarded.
x, _ = make_blobs(n_samples=200, centers=5, cluster_std=1.5, random_state=0)

# Hold out 10% of the rows as the test split.
train, test = train_test_split(x, test_size=0.1, random_state=5)

# Notebook-style display of the two split shapes.
train.shape, test.shape
원본 데이터와 스케일링 된 데이터 간의 비교를 확인하기 위한 함수
def compare_scaler(train_scaled, test_scaled, scaler):
    """Plot the original and scaled train/test points in three panels.

    Panel 0 shows the unscaled data, panel 1 the scaled data, and panel 2
    overlays both. The unscaled points are read from the module-level
    ``train`` and ``test`` arrays, not from the arguments.

    Args:
        train_scaled: Scaled training data; columns 0 and 1 are plotted.
        test_scaled: Scaled test data; columns 0 and 1 are plotted.
        scaler: Text used as the title of the scaled-data panel.
    """
    _, ax = plt.subplots(1, 3, figsize=[20, 5])

    # Panel 0: data before scaling.
    ax[0].scatter(train[:, 0], train[:, 1], c='b', label='train data set')
    ax[0].scatter(test[:, 0], test[:, 1], c='r', label='test data set')
    ax[0].set_title('ORIGINAL')

    # Panel 1: data after scaling.
    ax[1].scatter(train_scaled[:, 0], train_scaled[:, 1], c='b', label='train data set')
    ax[1].scatter(test_scaled[:, 0], test_scaled[:, 1], c='r', label='test data set')
    ax[1].set_title(scaler)

    # Panel 2: pale original points with smaller, darker scaled points on top.
    ax[2].scatter(train[:, 0], train[:, 1], c='#B5B2FF', label='original train data set')
    ax[2].scatter(test[:, 0], test[:, 1], c='#FFA7A7', label='original test data set')
    ax[2].scatter(train_scaled[:, 0], train_scaled[:, 1], s=10, c='#4641D9', label='scaled train data set')
    ax[2].scatter(test_scaled[:, 0], test_scaled[:, 1], s=10, c='#CC3D3D', label='scaled test data set')

    # plt.legend() only affects the current (last) axes, which left the first
    # two panels without a legend even though their points carry labels.
    for panel in ax:
        panel.legend()
    plt.show()
from sklearn.preprocessing import PowerTransformer
# PowerTransformer: fit on the training split, then reuse the fitted
# transform on the test split.
scaler = PowerTransformer()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Plot original vs. transformed points.
compare_scaler(
    train_scaled=train_scaled,
    test_scaled=test_scaled,
    scaler='PowerTransformer',
)
(output_distribution='normal' 사용)
from sklearn.preprocessing import QuantileTransformer
# n_quantiles defaults to 1000; here it is set to the number of training rows.
# Alternative with a normal output distribution:
#   QuantileTransformer(n_quantiles=len(train), output_distribution='normal')
scaler = QuantileTransformer(n_quantiles=len(train))

# Fit on the training split, then reuse the fitted transform on the test split.
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Plot original vs. transformed points.
compare_scaler(
    train_scaled=train_scaled,
    test_scaled=test_scaled,
    scaler='QuantileTransformer',
)
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler: fit on the training split, then reuse the fitted scaling
# on the test split.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Plot original vs. scaled points.
compare_scaler(
    train_scaled=train_scaled,
    test_scaled=test_scaled,
    scaler='MinMaxScaler',
)
from sklearn.preprocessing import StandardScaler
# StandardScaler: fit on the training split, then reuse the fitted scaling
# on the test split.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Plot original vs. scaled points.
compare_scaler(
    train_scaled=train_scaled,
    test_scaled=test_scaled,
    scaler='StandardScaler',
)
from sklearn.preprocessing import RobustScaler
# RobustScaler: fit on the training split, then reuse the fitted scaling
# on the test split.
scaler = RobustScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Plot original vs. scaled points.
compare_scaler(
    train_scaled=train_scaled,
    test_scaled=test_scaled,
    scaler='RobustScaler',
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset and mirror it in a DataFrame for inspection.
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_df['target'] = cancer.target
cancer_df.head()

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target,
    test_size=0.2, random_state=3
)

# Baseline: decision tree trained on the unscaled features.
# The tree is seeded because an unseeded DecisionTreeClassifier can build a
# different tree on every run, which makes the printed accuracy
# non-reproducible.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train)
no_scaler_score = clf.score(x_test, y_test)
print('No Scaler 모델 정확도 :', no_scaler_score)
No Scaler 모델 정확도 : 0.8859649122807017
# Scale the features with MinMaxScaler: fit on the training split, then
# reuse the fitted scaling on the test split.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(x_train)
test_scaled = scaler.transform(x_test)

# Decision tree trained on the scaled features.
# The tree is seeded because an unseeded DecisionTreeClassifier can build a
# different tree on every run; without a seed the printed accuracy reflects
# run-to-run randomness as much as the effect of scaling.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_scaled, y_train)
scaler_score = clf.score(test_scaled, y_test)
print('MinMaxScaler 모델 정확도 :', scaler_score)
MinMaxScaler 모델 정확도 : 0.9122807017543859
scaler.fit_transform()
fit() : 데이터 학습 함수
transform() : 학습한 것을 적용하여 변환하는 함수
fit_transform() : fit() + transform()