Data scaling is the process of transforming the feature values of a dataset so that they lie on a comparable scale, for example within a specific range such as 0 to 1 or -1 to 1. This ensures that no single feature dominates the distance calculations in an algorithm, which can help to improve the algorithm's performance. There are several methods of scaling data, including:
from sklearn.preprocessing import StandardScaler

# StandardScaler example.
# NOTE(review): `data` is not defined in the visible part of this file; it is
# assumed to be a 2-D numeric feature matrix provided beforehand -- confirm.

# fit() returns the scaler itself, so construction and fitting are chained.
sc = StandardScaler().fit(data)

# Apply the statistics learned during fit() to the same data.
data_scaled = sc.transform(data)
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler example.
# NOTE(review): `data` is not defined in the visible part of this file; it is
# assumed to be a 2-D numeric feature matrix provided beforehand -- confirm.

# fit() returns the scaler itself, so construction and fitting are chained.
sc = MinMaxScaler().fit(data)

# Rescale the same data using the per-feature bounds learned during fit().
data_scaled = sc.transform(data)
Robust
Standard
Min Max
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Notebook-style inspection of the available keys and the feature names.
cancer.keys()
cancer["feature_names"]

# Feature data (model inputs): one column per feature name.
x = pd.DataFrame(data=cancer["data"], columns=cancer["feature_names"])
x.head()

# Label data (answers): a single-column frame named 'cancer'.
y = pd.DataFrame({"cancer": cancer["target"]})
y.head()
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, Normalizer

# Every scaler below follows the same recipe:
#   1. create the scaling model,
#   2. fit it on the available data so it learns how the values must change,
#   3. transform the data using the learned rule.
# fit() returns the scaler itself, so steps 1 and 2 are chained.
#
# NOTE(review): each scaler is fit on the full feature frame `x`, and the
# train/test split happens afterwards, so test rows contribute to the scaling
# statistics. Confirm whether that is acceptable for this experiment.

# Robust scaling.
rbs = RobustScaler().fit(x)
x_rbs = rbs.transform(x)

# Standard scaling.
sds = StandardScaler().fit(x)
x_sds = sds.transform(x)

# Min-max scaling.
mms = MinMaxScaler().fit(x)
x_mms = mms.transform(x)
# Split each scaled feature matrix into train and test sets.
from sklearn.model_selection import train_test_split

# The original code called train_test_split three times into the same four
# names, so the Robust and Standard splits were discarded immediately, and
# each call used a different (unseeded) shuffle. Keep every split under its
# own key and share one seed so all three variants are split on the same rows
# and the run is reproducible.
SPLIT_SEED = 42
splits = {
    "robust": train_test_split(x_rbs, y, random_state=SPLIT_SEED),
    "standard": train_test_split(x_sds, y, random_state=SPLIT_SEED),
    "minmax": train_test_split(x_mms, y, random_state=SPLIT_SEED),
}

# Code further down reads x_train / x_test / y_train / y_test. The original
# left these bound to the Min-Max split (the last assignment), so that choice
# is preserved; switch the key to evaluate a different scaler.
x_train, x_test, y_train, y_test = splits["minmax"]
# k-nearest-neighbours classifier using the 3 closest training samples.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

# y_train is a single-column DataFrame, i.e. a column vector of shape (n, 1).
# scikit-learn expects 1-D labels and emits DataConversionWarning otherwise,
# so flatten the labels once instead of relying on warning suppression.
y_train_1d = y_train.to_numpy().ravel()
knn.fit(x_train, y_train_1d)

# Suppress warnings (kept from the original notebook).
# NOTE(review): this silences every warning process-wide; with the labels
# flattened above it is no longer needed for the label-shape warning.
import warnings
warnings.filterwarnings(action='ignore')

# Mean accuracy over 5-fold cross-validation on the training split.
from sklearn.model_selection import cross_val_score
cross_val_score(knn, x_train, y_train_1d, cv=5).mean()