python์ ๋ํํ๋ ๋จธ์ ๋ฌ๋ ๋ผ์ด๋ธ๋ฌ๋ฆฌ
๋งค์ฐ ๋ค์ํ ์ ์ฒ๋ฆฌ ๋๊ตฌ์ ์๊ณ ๋ฆฌ์ฆ์ ์ ๊ณตํ๊ณ ์์ด ๋จธ์ ๋ฌ๋ ๊ธฐ๋ฒ์ ๋ฐฐ์ฐ๋ ๋ฐ ์ ํฉ
๋ฐ์ดํฐ ๋ถ์์ ์ํ ๊ฐ๋จํ๊ณ ํจ์จ์ ์ธ ๋๊ตฌ๋ฅผ ์ ๊ณต
NumPy, Pandas, SciPy ๋ฐ matplotlib๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ๊ตฌ์ถ๋์ด ์์ด ๋ค๋ฅธ ํ์ด์ฌ ํจํค์ง์ ํจ๊ป ์ฌ์ฉํ๊ธฐ ์ฉ์ด
๋จ์ ์
๋ฅ๋ฌ๋, ๊ฐํํ์ต, ์๊ณ์ด ๋ชจํ์ ๋งค์ฐ ์ฝํจ
์ต๊ทผ ๊ฐ๋ฐ๋ ๋์ฉ๋์ ์ํ ๋ฐ์ดํฐํ๋ ์์ธ Polars์ ๊ฐ์ ๋ผ์ด๋ธ๋ฌ๋ฆฌ์๋ ์ฐ๋์ด ์ ์๋จ
์ฃผ์ ๊ธฐ๋ฅ

기저함수와 커널
1) ๊ธฐ์ ํจ์
: ๋ฐ์ดํฐ๊ฐ ๋น์ ํ์ด๋ฉด ์ ํํ๊ท๋ชจํ์ ์ ํฉํ์ง ์๋ค. ๋ฐ์ดํฐ์ ๋ง๋ ๋น์ ํ ๋ชจํ์ ๋ง๋ค๊ธฐ ์ํด ๋ฐ์ดํฐ์ ์ ํฉํ ๋น์ ํ ํจ์๋ฅผ ์๊ฐํด ๋ผ ์ ์์ด์ผ ํ๋ค. ์ด๋ฅผ ์ํด ๋ง๋ค์ด์ง ๊ฒ์ด ๊ธฐ์ ํจ์(basis function)๋ชจํ์ด๋ค.
1. ๋คํญ ๊ธฐ์ ํจ์(polynomial basis function)
: global function์ผ๋ก, ํ๋์ region์ด๋ผ์ ๋ฐ์ดํฐ ํ๋์ ๋ณ๊ฒฝ์ด ์ ์ฒด region์ ์ํฅ์ ๋ฏธ์น๋ค.
ํ ์คํธ๋ฅผ ์ํ ์์ํจ์
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Test curve for the basis-function demos: `num` evenly spaced inputs on
# [-1, 1], stored as a single column, and sin(2*pi*x) evaluated on them.
num = 100
X = np.linspace(-1, 1, num)[:, np.newaxis]
Y = np.sin(X * (2 * np.pi))
# Draw the curve as a solid green line.
plt.plot(X, Y, 'g-')

# Polynomial basis functions: draw x**i for every degree i = 0..M on [-1, 1].
num = 100
# Column vector of sample points; the column length follows `num` so the two
# cannot drift apart.
x = np.linspace(-1, 1, num).reshape(num, 1)
M = 9  # highest polynomial degree that is plotted
for i in range(M + 1):
    # One curve per degree, all drawn on the same axes.
    y = x**i
    plt.plot(x, y)
plt.title('polynomial curve fitting')


# Gaussian radial basis functions on [0, 1].
# For every grid size `interval` in 2..M, draw `interval` Gaussian bumps whose
# centres j / (interval - 1) are evenly spaced from 0 to 1. Every bump uses the
# same width (standard deviation 0.1). All curves share one set of axes.
num = 100
# Column vector of sample points; the column length follows `num`.
X = np.linspace(0, 1, num).reshape(num, 1)
M = 9  # largest number of centres that is plotted
for interval in range(2, M + 1):
    for j in range(interval):
        # exp(-(x - centre)^2 / (2 * sigma^2)) with sigma = 0.1
        y = np.exp(-(X - j / (interval - 1)) ** 2 / (2 * 0.1 ** 2))
        plt.plot(X, y)
plt.title("Radial basis function")


2) ์ปค๋(Kernel)
: ๋ฐ์ดํฐ๋ฅผ ๊ณ ์ฐจ์์ผ๋ก ๋ณด๋ด ์ํฌํธ ๋ฒกํฐ๋ฅผ ๊ตฌํ๊ณ ์ ์ฐจ์์ผ๋ก ์ถ์ํ๋ ๊ณผ์ ์ ๋ณต์กํ๊ณ ๋ง์ ์ฐ์ฐ๋์ ํ์๋ก ํ๊ธฐ ๋๋ฌธ์ Kernel Trick์ ์ฌ์ฉํ๋ค.

import pandas as pd
import seaborn as sns
# Display floating-point values rounded to four decimal places.
pd.set_option("display.float_format", lambda x: f'{x:.4f}')
# Load the iris example dataset through seaborn.
iris = sns.load_dataset('iris')
# Keep only the numeric columns. Selecting with include='number' states that
# intent directly, instead of relying on every non-numeric column having the
# `object` dtype.
iris = iris.select_dtypes(include='number')
# Inspect the descriptive statistics (rendered when run as a notebook cell).
iris.describe()
# Joint plot of petal_length against petal_width with a regression fit.
sns.jointplot(data=iris, x='petal_length', y='petal_width', kind='reg')

from sklearn.preprocessing import StandardScaler, RobustScaler
# Create one scaler of each kind.
standard_scaler, robust_scaler = StandardScaler(), RobustScaler()
# Fit each scaler on iris and wrap the scaled array in a DataFrame that keeps
# the original column names.
iris_standard = pd.DataFrame(
    data=standard_scaler.fit_transform(iris),
    columns=iris.columns,
)
iris_robust = pd.DataFrame(
    data=robust_scaler.fit_transform(iris),
    columns=iris.columns,
)
# Print the summary statistics of both results, separated by a blank line.
print("Standard Scaled : \n", iris_standard.describe(), end="\n\n")
print("Robust Scaled : \n", iris_robust.describe())
Standard Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean -0.0000 -0.0000 -0.0000 -0.0000
std 1.0034 1.0034 1.0034 1.0034
min -1.8700 -2.4339 -1.5676 -1.4471
25% -0.9007 -0.5924 -1.2266 -1.1838
50% -0.0525 -0.1320 0.3365 0.1325
75% 0.6745 0.5586 0.7628 0.7907
max 2.4920 3.0908 1.7858 1.7121
Robust Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean 0.0333 0.1147 -0.1691 -0.0671
std 0.6370 0.8717 0.5044 0.5082
min -1.1538 -2.0000 -0.9571 -0.8000
25% -0.5385 -0.4000 -0.7857 -0.6667
50% 0.0000 0.0000 0.0000 0.0000
75% 0.4615 0.6000 0.2143 0.3333
max 1.6154 2.8000 0.7286 0.8000
#pip install patchworklib
import seaborn as sns
import patchworklib as pw
# patchworklib set-up call made before any seaborn grid is loaded
# (NOTE(review): see the patchworklib README for why it is required).
pw.overwrite_axisgrid()
# Left panel: regression joint plot of the standard-scaled petal columns,
# converted into a patchworklib object so it can be composed.
g1 = pw.load_seaborngrid(
    sns.jointplot(data=iris_standard, x="petal_length", y="petal_width", kind="reg")
)
g1.set_suptitle("Standard Scaled")
# Right panel: the same plot for the robust-scaled data.
g2 = pw.load_seaborngrid(
    sns.jointplot(data=iris_robust, x="petal_length", y="petal_width", kind="reg")
)
g2.set_suptitle("Robust Scaled")
# Place the two panels side by side and display the result.
g3 = g1 | g2
g3

MinMaxScaler() : 범위가 [0,1]이 되도록 스케일링
MaxAbsScaler() : 양수는 [0,1], 음수는 [-1,0], 양음수는 [-1,1]이 되도록 스케일링
정규화 파이썬 예제
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
# Create one scaler of each kind.
minmax_scaler, maxabs_scaler = MinMaxScaler(), MaxAbsScaler()
# Fit each scaler on iris and wrap the scaled array in a DataFrame that keeps
# the original column names.
iris_minmax = pd.DataFrame(
    data=minmax_scaler.fit_transform(iris),
    columns=iris.columns,
)
iris_maxabs = pd.DataFrame(
    data=maxabs_scaler.fit_transform(iris),
    columns=iris.columns,
)
# Print the summary statistics of both results, separated by a blank line.
print("MinMax Scaled : \n", iris_minmax.describe(), end="\n\n")
print("MaxAbs Scaled : \n", iris_maxabs.describe())
MinMax Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean 0.4287 0.4406 0.4675 0.4581
std 0.2300 0.1816 0.2992 0.3176
min 0.0000 0.0000 0.0000 0.0000
25% 0.2222 0.3333 0.1017 0.0833
50% 0.4167 0.4167 0.5678 0.5000
75% 0.5833 0.5417 0.6949 0.7083
max 1.0000 1.0000 1.0000 1.0000
MaxAbs Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean 0.7397 0.6948 0.5446 0.4797
std 0.1048 0.0991 0.2558 0.3049
min 0.5443 0.4545 0.1449 0.0400
25% 0.6456 0.6364 0.2319 0.1200
50% 0.7342 0.6818 0.6304 0.5200
75% 0.8101 0.7500 0.7391 0.7200
max 1.0000 1.0000 1.0000 1.0000
# Joint plots for the MinMax- and MaxAbs-scaled data computed just above.
# These panels are bound to g4/g5/g6, continuing the g1..g3 numbering of the
# Standard/Robust panels, so the earlier panels are not overwritten and the
# final overview grid can refer to g4 and g5.
g4 = sns.jointplot(data = iris_minmax, x = "petal_length", y = "petal_width", kind = "reg")
g4 = pw.load_seaborngrid(g4)
g4.set_suptitle("MinMax Scaled")
g5 = sns.jointplot(data = iris_maxabs, x = "petal_length", y = "petal_width", kind = "reg")
g5 = pw.load_seaborngrid(g5)
g5.set_suptitle("MaxAbs Scaled")
# Place the two panels side by side and display the result.
g6 = (g4|g5)
g6

5) 변환(Transformation)
import numpy as np
from sklearn.preprocessing import PowerTransformer, Normalizer
# Create one transformer of each kind.
power_scaler, normal_scaler = PowerTransformer(), Normalizer()
# Fit each transformer on iris and wrap the result in a DataFrame that keeps
# the original column names.
iris_pow = pd.DataFrame(
    data=power_scaler.fit_transform(iris),
    columns=iris.columns,
)
iris_norm = pd.DataFrame(
    data=normal_scaler.fit_transform(iris),
    columns=iris.columns,
)
# Print the summary statistics of both results, separated by a blank line.
print("Power Scaled : \n", iris_pow.describe(), end="\n\n")
print("Normalizer Scaled : \n", iris_norm.describe())
# Check that every row of the Normalizer output has Euclidean length 1.
row_lengths = np.linalg.norm(iris_norm, axis=1)
print("Eucledian Distance from 0 : \n", row_lengths)
Power Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean 0.0000 -0.0000 -0.0000 0.0000
std 1.0034 1.0034 1.0034 1.0034
min -2.1378 -2.7591 -1.5456 -1.4768
25% -0.8957 -0.5615 -1.2244 -1.1896
50% 0.0264 -0.0819 0.3226 0.1597
75% 0.7222 0.5959 0.7598 0.7965
max 2.1770 2.7432 1.8288 1.6585
Normalizer Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean 0.7514 0.4052 0.4548 0.1411
std 0.0444 0.1056 0.1600 0.0780
min 0.6539 0.2384 0.1678 0.0147
25% 0.7153 0.3267 0.2509 0.0487
50% 0.7549 0.3544 0.5364 0.1641
75% 0.7869 0.5276 0.5800 0.1975
max 0.8609 0.6071 0.6370 0.2804
Eucledian Distance from 0 :
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1.]
# Left panel: regression joint plot of the power-transformed petal columns,
# converted into a patchworklib object so it can be composed.
g7 = pw.load_seaborngrid(
    sns.jointplot(data=iris_pow, x="petal_length", y="petal_width", kind="reg")
)
g7.set_suptitle("PowerTransformer Scaled")
# Right panel: the same plot for the row-normalised data.
g8 = pw.load_seaborngrid(
    sns.jointplot(data=iris_norm, x="petal_length", y="petal_width", kind="reg")
)
g8.set_suptitle("Normalizer Scaled")
# Place the two panels side by side and display the result.
g9 = g7 | g8
g9

from sklearn.preprocessing import QuantileTransformer
# Create the two quantile transformers: one mapping to a normal distribution,
# one to a uniform distribution.
# n_quantiles is set to the number of rows explicitly. The library default
# (1000) is larger than this dataset, so scikit-learn would warn and clamp it
# to the sample count on every fit; passing the row count gives the same
# result without the warning.
gaussian_scaler = QuantileTransformer(n_quantiles = len(iris), output_distribution = 'normal')
uniform_scaler = QuantileTransformer(n_quantiles = len(iris), output_distribution = 'uniform')
# Fit each transformer on iris and wrap the result in a DataFrame that keeps
# the original column names.
iris_gaussian = pd.DataFrame(gaussian_scaler.fit_transform(iris), columns = iris.columns)
iris_uniform = pd.DataFrame(uniform_scaler.fit_transform(iris), columns = iris.columns)
# Print the summary statistics of both results, separated by a blank line.
print("QuantileTransformer_Gaussian Scaled : \n", iris_gaussian.describe())
print()
print("QuantileTransformer_Uniform Scaled : \n", iris_uniform.describe())
QuantileTransformer_Gaussian Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean -0.0012 0.0014 0.0021 -0.0339
std 1.1311 1.1328 1.1331 1.4616
min -5.1993 -5.1993 -5.1993 -5.1993
25% -0.7011 -0.6175 -0.6175 -0.6798
50% 0.0252 -0.0842 0.0084 -0.0589
75% 0.6587 0.6277 0.6692 0.6277
max 5.1993 5.1993 5.1993 5.1993
QuantileTransformer_Uniform Scaled :
sepal_length sepal_width petal_length petal_width
count 150.0000 150.0000 150.0000 150.0000
mean 0.5002 0.5002 0.5004 0.5001
std 0.2914 0.2900 0.2914 0.2912
min 0.0000 0.0000 0.0000 0.0000
25% 0.2416 0.2685 0.2685 0.2483
50% 0.5101 0.4664 0.5034 0.4765
75% 0.7450 0.7349 0.7483 0.7349
max 1.0000 1.0000 1.0000 1.0000
# Left panel: regression joint plot of the normal-output quantile transform,
# converted into a patchworklib object so it can be composed.
g10 = pw.load_seaborngrid(
    sns.jointplot(data=iris_gaussian, x="petal_length", y="petal_width", kind="reg")
)
g10.set_suptitle("QuantileTransformer_Gaussian Scaled")
# Right panel: the same plot for the uniform-output quantile transform.
g11 = pw.load_seaborngrid(
    sns.jointplot(data=iris_uniform, x="petal_length", y="petal_width", kind="reg")
)
g11.set_suptitle("QuantileTransformer_Uniform Scaled")
# Place the two panels side by side and display the result.
g12 = g10 | g11
g12

# Overview figure: two rows of four joint-plot panels each.
# NOTE(review): this needs g4 and g5 (presumably the MinMax / MaxAbs joint
# plots) to be bound earlier in the session - confirm before running.
top_row = g1 | g2 | g4 | g5
bottom_row = g7 | g8 | g10 | g11
# Stack the rows vertically and display the result.
top_row / bottom_row
