import pandas as pd
from sklearn.datasets import load_wine
wine_load = load_wine()
wine = pd.DataFrame(wine_load.data, columns=wine_load.feature_names)
wine.head()
wine['Class'] = wine_load.target
wine_dummy = pd.get_dummies(wine, columns=['Class'])
wine_dummy.head()
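Since k classes are fully determined by k-1 indicator columns, one dummy can be dropped to avoid a redundant, perfectly collinear column. A small variation of the call above using pandas' drop_first option (wine_dummy2 is an illustrative name):
# Drop the first dummy: 3 classes -> 2 indicator columns
wine_dummy2 = pd.get_dummies(wine, columns=['Class'], drop_first=True)
wine_dummy2.head()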
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
iris_load = load_iris()
iris = pd.DataFrame(iris_load.data, columns=iris_load.feature_names)
iris['Class'] = iris_load.target
iris['Class'] = iris['Class'].map({
    0: 'Setosa',
    1: 'Versicolour',
    2: 'Virginica'
})
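The same label mapping can also be built from the metadata shipped with the dataset instead of hardcoding it; a small sketch (name_map is an illustrative name):
# Build the code -> name dictionary from the dataset itself
name_map = {code: str(name) for code, name in enumerate(iris_load.target_names)}
print(name_map)  # expected: {0: 'setosa', 1: 'versicolor', 2: 'virginica'}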
x_train, x_test, y_train, y_test = train_test_split(
    iris.drop(columns='Class'),
    iris['Class'],
    test_size=0.3,
    random_state=1000
)
print('x_train :', x_train.shape, 'x_test :', x_test.shape)
print('y_train :', y_train.shape, 'y_test :', y_test.shape)
x_train, x_test, y_train, y_test = train_test_split(
    iris.drop(columns='Class'),
    iris['Class'],
    test_size=0.3,
    stratify=iris['Class']
)
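A quick way to confirm that stratify preserved the class ratios is to compare the normalized class counts of the full data against both splits:
# Class proportions should be nearly identical across all three
print(iris['Class'].value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))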
Most analysis algorithms do not work well when the value ranges differ greatly between columns: a column with a large value range is treated as having a much bigger influence on predicting the target variable than a column with a small range. Scaling is the step that brings the values of every column onto a comparable range.
Data scaling procedure: fit the scaler on the train data only, then transform both the train and the test data with that fitted scaler, as each block below does.
from sklearn.preprocessing import StandardScaler
StdScaler = StandardScaler()
# Fit the scaler on the train data and scale it
StdScaler.fit(x_train)
X_train_sc = StdScaler.transform(x_train)
# Scale the test data with the scaler fitted on the train data
X_test_sc = StdScaler.transform(x_test)
print("\t\t(min, max) (mean, std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(), X_train_sc.std()))
print("Test_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(), X_test_sc.std()))
from sklearn.preprocessing import MinMaxScaler
MmScaler = MinMaxScaler()
# Fit the scaler on the train data and scale it
MmScaler.fit(x_train)
X_train_sc = MmScaler.transform(x_train)
# Scale the test data with the scaler fitted on the train data
X_test_sc = MmScaler.transform(x_test)
print("\t\t(min, max) (mean, std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(), X_train_sc.std()))
print("Test_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(), X_test_sc.std()))
from sklearn.preprocessing import MaxAbsScaler
MaScaler = MaxAbsScaler()
# Fit the scaler on the train data and scale it
MaScaler.fit(x_train)
X_train_sc = MaScaler.transform(x_train)
# Scale the test data with the scaler fitted on the train data
X_test_sc = MaScaler.transform(x_test)
print("\t\t(min, max) (mean, std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(), X_train_sc.std()))
print("Test_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(), X_test_sc.std()))
from sklearn.preprocessing import RobustScaler
RuScaler = RobustScaler()
# Fit the scaler on the train data and scale it
RuScaler.fit(x_train)
X_train_sc = RuScaler.transform(x_train)
# Scale the test data with the scaler fitted on the train data
X_test_sc = RuScaler.transform(x_test)
print("\t\t(min, max) (mean, std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(), X_train_sc.std()))
print("Test_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(), X_test_sc.std()))
X_Original = RuScaler.inverse_transform(X_train_sc)
pd.DataFrame(X_Original, columns=x_train.columns).head(3)
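To confirm the round trip really recovers the original values, compare against the unscaled train data:
print(np.allclose(X_Original, x_train))  # expected: True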