import pandas as pd
# Sample data used by every encoding example below:
# two numeric columns (weight, height) and three categorical ones.
_sample_columns = {
    'weight': [40, 80, 60, 50, 90],
    'height': [162, 155, 182, 173, 177],
    'sex': ['f', 'm', 'm', 'f', 'm'],
    'blood_type': ['O', 'A', 'B', 'O', 'A'],
    'health': ['good', 'excellent', 'bad', 'bad', 'good'],
}
df = pd.DataFrame(_sample_columns)
df
weight height sex blood_type health
0 40 162 f O good
1 80 155 m A excellent
2 60 182 m B bad
3 50 173 f O bad
4 90 177 m A good
from sklearn.preprocessing import OrdinalEncoder
# Ordinal-encode every column of the frame.
# Create the OrdinalEncoder object.
oe = OrdinalEncoder()
# Fit the encoder on the data. Because the whole frame is passed in, the
# numeric columns (weight, height) are treated as categories as well.
oe.fit(df)
print(f'{oe.categories_ = }')
# transform() returns a plain array, so wrap it in a DataFrame again and
# restore both the column labels and the original row index. (No df.copy()
# is needed beforehand: the encoded frame is built from scratch here.)
df_oe = pd.DataFrame(oe.transform(df), columns=df.columns, index=df.index)
df_oe
oe.categories_ = [array([40, 50, 60, 80, 90]), array([155, 162, 173, 177, 182]), array(['f', 'm'], dtype=object), array(['A', 'B', 'O'], dtype=object), array(['bad', 'excellent', 'good'], dtype=object)]
weight height sex blood_type health
0 0.0 1.0 0.0 2.0 2.0
1 3.0 0.0 1.0 0.0 1.0
2 2.0 4.0 1.0 1.0 0.0
3 1.0 2.0 0.0 2.0 0.0
4 4.0 3.0 1.0 0.0 2.0
# Ordinal-encode only the categorical columns; the other columns are kept as-is.
cat_cols = ['sex', 'blood_type']
# Work on a copy so the original frame stays unchanged.
df_oe = df.copy()
oe = OrdinalEncoder()
oe.fit(df[cat_cols])
print(f'{oe.categories_ = }')
# Write the codes back by column name rather than by position, so the target
# columns always match the columns the encoder was fitted on.
df_oe[cat_cols] = oe.transform(df[cat_cols])
df_oe
oe.categories_ = [array(['f', 'm'], dtype=object), array(['A', 'B', 'O'], dtype=object)]
weight height sex blood_type health
0 40 162 0.0 2.0 good
1 80 155 1.0 0.0 excellent
2 60 182 1.0 1.0 bad
3 50 173 0.0 2.0 bad
4 90 177 1.0 0.0 good
# Decode the two encoded columns (positions 2 and 3 of df_oe) back to their
# original labels using the fitted encoder.
encoded_part = df_oe.iloc[:, 2:4]
oe.inverse_transform(encoded_part)
array([['f', 'O'],
['m', 'A'],
['m', 'B'],
['f', 'O'],
['m', 'A']], dtype=object)
from sklearn.preprocessing import LabelEncoder
# LabelEncoder can only convert a single variable at a time.
le = LabelEncoder()
# fit() is called on the encoder created above; its return value is kept
# under the name health_le and used for the transform.
health_le = le.fit(df['health'])
# Build a new frame whose 'health' column holds the integer codes;
# every other column is carried over from df unchanged.
df_le = df.assign(health=health_le.transform(df['health']))
df_le
weight height sex blood_type health
0 40 162 f O 2
1 80 155 m A 1
2 60 182 m B 0
3 50 173 f O 0
4 90 177 m A 2
from sklearn.preprocessing import OneHotEncoder
# Copy the DataFrame so the original is left unchanged.
df_one = df.copy()
# Double brackets keep the selection as a one-column DataFrame.
blood_type_frame = df_one[['blood_type']]
# Create the OneHotEncoder object and fit it on the blood_type column.
oneHot = OneHotEncoder()
oneHot.fit(blood_type_frame)
print(f'{oneHot.categories_=}')
# Apply the fitted encoder. The result comes back as a sparse matrix,
# so toarray() converts it to a dense ndarray.
blood_type_onehot = oneHot.transform(blood_type_frame).toarray()
# Insert one new column per learned category.
df_one[oneHot.categories_[0]] = blood_type_onehot
df_one

# pandas alternative: expand 'sex' and 'blood_type' into dummy (indicator)
# columns in one call. drop_first is explicitly left off.
pd.get_dummies(
    df,
    columns=['sex', 'blood_type'],
    drop_first=False,
)

from sklearn.preprocessing import Binarizer
df_bin = df.copy()
# Binarizer is applied to numeric variables only.
# Each source column is paired with the threshold used to binarize it.
bin_thresholds = {'weight': 50, 'height': 170}
for col_name, cutoff in bin_thresholds.items():
    # Reshape the column into a single-column 2-D array before encoding.
    column_2d = df[col_name].values.reshape(-1, 1)
    # Object creation, fit and transform are done in a single expression.
    df_bin[f'{col_name}_bin'] = Binarizer(threshold=cutoff).fit_transform(column_2d)
df_bin

from sklearn.preprocessing import LabelBinarizer
df_lb = df.copy()
# Create the LabelBinarizer and fit it on the health column.
lb = LabelBinarizer()
lb.fit(df['health'])
print(f'{lb.classes_=}')
# lb.transform(): apply the encoding.
health_lb = lb.transform(df['health'])
print('health_lb = \n', health_lb)
# Attach the encoded matrix to the frame, one column per class label.
df_lb[lb.classes_] = health_lb
df_lb
health_lb =
[[0 0 1]
[0 1 0]
[1 0 0]
[1 0 0]
[0 0 1]]

from sklearn.preprocessing import MultiLabelBinarizer
# Add a multi-label column: each row holds a list of subject names.
subjects_per_row = [
    ['math', 'english'],
    ['math', 'science'],
    ['science'],
    ['math', 'english'],
    ['science'],
]
df_mlb = df.copy()
df_mlb['test'] = subjects_per_row
df_mlb

# Create the MultiLabelBinarizer and fit it on the list-valued 'test' column.
mlb = MultiLabelBinarizer()
mlb.fit(df_mlb['test'])
# The classes_ attribute shows which classes were encoded.
print(f'{mlb.classes_=}')
# Add one column per class, filled with the encoded values.
test_encoded = mlb.transform(df_mlb['test'])
df_mlb[mlb.classes_] = test_encoded
df_mlb

# KBinsDiscretizer: 연속형 변수를 구간별로 나누어 범주형 변수로 변환
# (quantization(양자화), binning 이라고도 함)
from sklearn.preprocessing import KBinsDiscretizer
df_kbd = df.copy()
# The two numeric columns to discretize.
numeric_part = df[['weight', 'height']]
# Create the discretizer (3 bins, ordinal codes) and fit it on those columns.
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal')
kbd.fit(numeric_part)
# kbd.transform(): apply the encoding and store the bin index of each value.
df_kbd[['weight_bin', 'height_bin']] = kbd.transform(numeric_part)
df_kbd
