bream_length = [25.4, 26.3, 26.5, 29.0, 29.0, 29.7, 29.7, 30.0, 30.0, 30.7, 31.0, 31.0, 31.5, 32.0, 32.0, 32.0, 33.0, 33.0, 33.5, 33.5, 34.0, 34.0, 34.5, 35.0, 35.0, 35.0, 35.0, 36.0, 36.0, 37.0, 38.5, 38.5, 39.5, 41.0, 41.0]
bream_weight = [242.0, 290.0, 340.0, 363.0, 430.0, 450.0, 500.0, 390.0, 450.0, 500.0, 475.0, 500.0, 500.0, 340.0, 600.0, 600.0, 700.0, 700.0, 610.0, 650.0, 575.0, 685.0, 620.0, 680.0, 700.0, 725.0, 720.0, 714.0, 850.0, 1000.0, 920.0, 955.0, 925.0, 975.0, 950.0]
Let's visualize the distribution of the bream data.
import matplotlib.pyplot as plt
plt.style.use(['seaborn'])
plt.scatter(bream_length, bream_weight)
plt.xlabel('length')
plt.ylabel('weight')
plt.show()
smelt_length = [9.8, 10.5, 10.6, 11.0, 11.2, 11.3, 11.8, 11.8, 12.0, 12.2, 12.4, 13.0, 14.3, 15.0]
smelt_weight = [6.7, 7.5, 7.0, 9.7, 9.8, 8.7, 10.0, 9.9, 9.8, 12.2, 13.4, 12.2, 19.7, 19.9]
plt.scatter(bream_length, bream_weight)
plt.scatter(smelt_length, smelt_weight)
plt.xlabel("length")
plt.show("weight")
Plotting the bream and smelt together gives the following graph.
len(smelt_length), len(bream_weight)
(14, 35)
length = bream_length + smelt_length
weight = bream_weight + smelt_weight
fish_data = [[l,w] for l, w in zip(length, weight)]
fish_data
[[25.4, 242.0],
[26.3, 290.0],
[26.5, 340.0],
[29.0, 363.0],
[29.0, 430.0],
[29.7, 450.0],
[29.7, 500.0],
[30.0, 390.0],
[30.0, 450.0],
[30.7, 500.0],
[31.0, 475.0],
[31.0, 500.0],
[31.5, 500.0],
[32.0, 340.0],
[32.0, 600.0],
[32.0, 600.0],
[33.0, 700.0],
[33.0, 700.0],
[33.5, 610.0],
[33.5, 650.0],
[34.0, 575.0],
[34.0, 685.0],
[34.5, 620.0],
[35.0, 680.0],
[35.0, 700.0],
[35.0, 725.0],
[35.0, 720.0],
[36.0, 714.0],
[36.0, 850.0],
[37.0, 1000.0],
[38.5, 920.0],
[38.5, 955.0],
[39.5, 925.0],
[41.0, 975.0],
[41.0, 950.0],
[9.8, 6.7],
[10.5, 7.5],
[10.6, 7.0],
[11.0, 9.7],
[11.2, 9.8],
[11.3, 8.7],
[11.8, 10.0],
[11.8, 9.9],
[12.0, 9.8],
[12.2, 12.2],
[12.4, 13.4],
[13.0, 12.2],
[14.3, 19.7],
[15.0, 19.9]]
fish_target = [1] * 35 + [0] * 14  # 1 = bream, 0 = smelt
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier()
kn.fit(fish_data, fish_target)
KNeighborsClassifier()
kn.score(fish_data, fish_target)
1.0
kn.predict([[30,600]])
array([1])
for n in range(5, 50):
    kn.n_neighbors = n
    score = kn.score(fish_data, fish_target)
    if score < 1:
        print(n, score)
        break
18 0.9795918367346939
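So for every k from 5 to 17 the model still classifies all 49 training samples perfectly; at n_neighbors = 18 the training accuracy first dips below 1 (48/49 ≈ 0.98, one misclassified sample).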
fish_length = [25.4, 26.3, 26.5, 29.0, 29.0, 29.7, 29.7, 30.0, 30.0, 30.7, 31.0, 31.0,
31.5, 32.0, 32.0, 32.0, 33.0, 33.0, 33.5, 33.5, 34.0, 34.0, 34.5, 35.0,
35.0, 35.0, 35.0, 36.0, 36.0, 37.0, 38.5, 38.5, 39.5, 41.0, 41.0, 9.8,
10.5, 10.6, 11.0, 11.2, 11.3, 11.8, 11.8, 12.0, 12.2, 12.4, 13.0, 14.3, 15.0]
fish_weight = [242.0, 290.0, 340.0, 363.0, 430.0, 450.0, 500.0, 390.0, 450.0, 500.0, 475.0, 500.0,
500.0, 340.0, 600.0, 600.0, 700.0, 700.0, 610.0, 650.0, 575.0, 685.0, 620.0, 680.0,
700.0, 725.0, 720.0, 714.0, 850.0, 1000.0, 920.0, 955.0, 925.0, 975.0, 950.0, 6.7,
7.5, 7.0, 9.7, 9.8, 8.7, 10.0, 9.9, 9.8, 12.2, 13.4, 12.2, 19.7, 19.9]
fish_data = [[l,w] for l, w in zip(fish_length, fish_weight)]
fish_target = [1] * 35 + [0] * 14
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier()
print(fish_data[0:4])
[[25.4, 242.0], [26.3, 290.0], [26.5, 340.0], [29.0, 363.0]]
import numpy as np
input_arr = np.array(fish_data)
target_arr = np.array(fish_target)
np.random.seed(42)        # fix the seed so the shuffle is reproducible
index = np.arange(49)     # one index per sample
np.random.shuffle(index)  # shuffled indices for a manual train/test split
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(fish_data, fish_target, random_state = 42)
print(train_target.shape, test_target.shape)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-41-b51399f165ba> in <module>
----> 1 print(train_target.shape, test_target.shape)
AttributeError: 'list' object has no attribute 'shape'
kn.fit(train_input, train_target)
kn.score(test_input, test_target)
1.0
print(kn.predict([[25, 150]]))
[0]
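This prediction looks wrong: a 25 cm, 150 g fish should be a bream (1), not a smelt (0). The cause is feature scale; weight spans roughly 6–1000 g while length spans only about 10–41 cm, so Euclidean distance is dominated by weight. Below is a minimal sketch of the usual remedy, standardizing the features with scikit-learn's StandardScaler (an aside added here, not part of the original notebook):

from sklearn.preprocessing import StandardScaler

# Put both features on a comparable scale so neither dominates the distance.
scaler = StandardScaler()
scaled_input = scaler.fit_transform(train_input)
kn.fit(scaled_input, train_target)
# The query point must be transformed with the same scaler as the training data.
print(kn.predict(scaler.transform([[25, 150]])))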
import matplotlib.pyplot as plt
plt.style.use(['seaborn'])
plt.scatter(train_input[:,0], train_input[:,1])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-49-5cb02cebfedb> in <module>
1 import matplotlib.pyplot as plt
2 plt.style.use(['seaborn'])
----> 3 plt.scatter(train_input[:,0], train_input[:,1])
TypeError: list indices must be integers or slices, not tuple
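Both tracebacks have the same cause: train_test_split returns the same container type it is given, and plain Python lists support neither .shape nor 2-D indexing like train_input[:, 0]. A minimal fix, reusing the input_arr and target_arr built above, is to split NumPy arrays instead:

# Splitting NumPy arrays yields arrays, which support .shape and 2-D indexing.
train_input, test_input, train_target, test_target = train_test_split(
    input_arr, target_arr, random_state=42)
print(train_input.shape, test_input.shape)  # (36, 2) (13, 2)

plt.scatter(train_input[:, 0], train_input[:, 1])
plt.xlabel('length')
plt.ylabel('weight')
plt.show()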
perch_length = np.array([8.4, 13.7, 15.0, 16.2, 17.4, 18.0, 18.7, 19.0, 19.6, 20.0, 21.0,
21.0, 21.0, 21.3, 22.0, 22.0, 22.0, 22.0, 22.0, 22.5, 22.5, 22.7,
23.0, 23.5, 24.0, 24.0, 24.6, 25.0, 25.6, 26.5, 27.3, 27.5, 27.5,
27.5, 28.0, 28.7, 30.0, 32.8, 34.5, 35.0, 36.5, 36.0, 37.0, 37.0,
39.0, 39.0, 39.0, 40.0, 40.0, 40.0, 40.0, 42.0, 43.0, 43.0, 43.5,
44.0])
perch_weight = np.array([5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0,
218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0,
556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0,
850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0,
1000.0])
import matplotlib.pyplot as plt
plt.scatter(perch_length, perch_weight)
plt.xlabel("length")
plt.ylabel("weight")
plt.show()
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(perch_length, perch_weight, random_state = 42)
## Reshaping to 2-D
train_input = train_input.reshape(-1, 1)  # sklearn expects a 2-D (samples, features) array
test_input = test_input.reshape(-1, 1)    # -1 lets NumPy infer the number of rows
print(train_input.shape, test_input.shape)
(42, 1) (14, 1)
train_input
array([[19.6],
[22. ],
[18.7],
[17.4],
[36. ],
[25. ],
[40. ],
[39. ],
[43. ],
[22. ],
[20. ],
[22. ],
[24. ],
[27.5],
[43. ],
[40. ],
[24. ],
[21. ],
[27.5],
[40. ],
[32.8],
[26.5],
[36.5],
[13.7],
[22.7],
[15. ],
[37. ],
[35. ],
[28.7],
[23.5],
[39. ],
[21. ],
[23. ],
[22. ],
[44. ],
[22.5],
[19. ],
[37. ],
[22. ],
[25.6],
[42. ],
[34.5]])
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)
KNeighborsRegressor()
knr.score(test_input, test_target)
0.992809406101064
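Note that for a regressor, score() returns the coefficient of determination R² rather than accuracy, so 0.993 means the model explains about 99% of the variance in the test targets.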
## How large is the error?
from sklearn.metrics import mean_absolute_error
test_prediction = knr.predict(test_input)
mae = mean_absolute_error(test_target, test_prediction)
print(mae)
19.157142857142862
## Fixing the underfitting problem
knr.n_neighbors = 3
knr.fit(train_input, train_target)
print(knr.score(train_input, train_target))
print(knr.score(test_input, test_target))
0.9804899950518966
0.9746459963987609
knr.predict([[100]])
array([1033.33333333])
distance, indexes = knr.kneighbors([[50]])
plt.scatter(train_input, train_target)
plt.scatter(train_input[indexes], train_target[indexes],marker = 'D')
plt.scatter(50, 1033, marker = '^')
plt.show()
A problem appears: for any sample larger than everything in the training set, the predicted weight comes out the same regardless of the sample's actual size, because the prediction only looks at the nearest neighbors.
We will address this with a linear approximation, that is, a linear regression model.
print(np.mean(train_target[indexes]))
1033.3333333333333
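This confirms how a k-NN regressor predicts: it simply averages the targets of the k nearest neighbors, so every sample beyond the training range gets the same ~1033 g no matter how large it is.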
## A linear model to fix the k-NN limitation
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_input, train_target)
print(lr.predict([[50]]))
[1241.83860323]
print(lr.coef_, lr.intercept_)
[39.01714496] -709.0186449535477
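So the fitted line is weight ≈ 39.017 × length − 709.019; plugging in a length of 50 gives 39.017 × 50 − 709.019 ≈ 1241.84, matching the prediction above.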
plt.scatter(train_input, train_target)
plt.plot([15, 50], [15 * lr.coef_ + lr.intercept_, 50 * lr.coef_ + lr.intercept_])
plt.scatter(50, 1241.8, marker='^')
plt.show()
Fitting the linear regression model showed quite a few points that the straight line does not match well. To compensate,
we square the length feature, turning the model into a polynomial equation that can approximate the data more closely.
lr.score(test_input, test_target)
0.8247503123313558
train_poly = np.column_stack((train_input ** 2, train_input))  # columns: length², length
test_poly = np.column_stack((test_input ** 2, test_input))
print(train_poly.shape, test_poly.shape)
(42, 2) (14, 2)
lr = LinearRegression()
lr.fit(train_poly, train_target)
print(lr.predict([[50**2,50]]))
[1573.98423528]
print(lr.coef_, lr.intercept_)
[ 1.01433211 -21.55792498] 116.0502107827827
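As a sanity check, the fitted curve is weight ≈ 1.0143 × length² − 21.558 × length + 116.05; at length 50 this is 1.0143 × 2500 − 21.558 × 50 + 116.05 ≈ 1573.98, matching the prediction above.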
plt.scatter(train_input, train_target)
point = np.arange(15,50)
plt.plot(point, 1.01 * point **2 - 21.6 * point + 116.05)
plt.show()
print(lr.score(train_poly, train_target))
print(lr.score(test_poly, test_target))
0.9706807451768623
0.9775935108325122
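The test score now comes out slightly above the train score, a hint that the model is still a touch underfit; the next step is to add more features, loading the full perch dataset (length, height, width) below.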
import pandas as pd
df = pd.read_csv('https://bit.ly/perch_csv_data')
perch_full = df.to_numpy()
print(perch_full)
[[ 8.4 2.11 1.41]
[13.7 3.53 2. ]
[15. 3.82 2.43]
[16.2 4.59 2.63]
[17.4 4.59 2.94]
[18. 5.22 3.32]
[18.7 5.2 3.12]
[19. 5.64 3.05]
[19.6 5.14 3.04]
[20. 5.08 2.77]
[21. 5.69 3.56]
[21. 5.92 3.31]
[21. 5.69 3.67]
[21.3 6.38 3.53]
[22. 6.11 3.41]
[22. 5.64 3.52]
[22. 6.11 3.52]
[22. 5.88 3.52]
[22. 5.52 4. ]
[22.5 5.86 3.62]
[22.5 6.79 3.62]
[22.7 5.95 3.63]
[23. 5.22 3.63]
[23.5 6.28 3.72]
[24. 7.29 3.72]
[24. 6.38 3.82]
[24.6 6.73 4.17]
[25. 6.44 3.68]
[25.6 6.56 4.24]
[26.5 7.17 4.14]
[27.3 8.32 5.14]
[27.5 7.17 4.34]
[27.5 7.05 4.34]
[27.5 7.28 4.57]
[28. 7.82 4.2 ]
[28.7 7.59 4.64]
[30. 7.62 4.77]
[32.8 10.03 6.02]
[34.5 10.26 6.39]
[35. 11.49 7.8 ]
[36.5 10.88 6.86]
[36. 10.61 6.74]
[37. 10.84 6.26]
[37. 10.57 6.37]
[39. 11.14 7.49]
[39. 11.14 6. ]
[39. 12.43 7.35]
[40. 11.93 7.11]
[40. 11.73 7.22]
[40. 12.38 7.46]
[40. 11.14 6.63]
[42. 12.8 6.87]
[43. 11.93 7.28]
[43. 12.51 7.42]
[43.5 12.6 8.14]
[44. 12.49 7.6 ]]
import numpy as np
perch_weight = np.array([5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0,
218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0,
556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0,
850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0,
1000.0])
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(perch_full, perch_weight, random_state = 42)
from sklearn.preprocessing import PolynomialFeatures
# the degree parameter controls how many polynomial terms are generated
poly = PolynomialFeatures(degree = 5 , include_bias = False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
print(train_poly.shape)
(42, 55)
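Why 55 columns? With 3 input features and degree=5, PolynomialFeatures generates every monomial of total degree at most 5; there are C(8, 3) = 56 of these including the constant term, and 55 once include_bias=False drops it.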
train_poly
array([[1.96000000e+01, 5.14000000e+00, 3.04000000e+00, ...,
7.42244501e+02, 4.38992857e+02, 2.59637799e+02],
[2.20000000e+01, 5.88000000e+00, 3.52000000e+00, ...,
1.50793507e+03, 9.02709432e+02, 5.40397483e+02],
[1.87000000e+01, 5.20000000e+00, 3.12000000e+00, ...,
8.21240709e+02, 4.92744425e+02, 2.95646655e+02],
...,
[2.56000000e+01, 6.56000000e+00, 4.24000000e+00, ...,
3.28023719e+03, 2.12015331e+03, 1.37034299e+03],
[4.20000000e+01, 1.28000000e+01, 6.87000000e+00, ...,
5.31239245e+04, 2.85126063e+04, 1.53032504e+04],
[3.45000000e+01, 1.02600000e+01, 6.39000000e+00, ...,
2.74661189e+04, 1.71060916e+04, 1.06537939e+04]])
poly.get_feature_names()
['x0',
'x1',
'x2',
'x0^2',
'x0 x1',
'x0 x2',
'x1^2',
'x1 x2',
'x2^2',
'x0^3',
'x0^2 x1',
'x0^2 x2',
'x0 x1^2',
'x0 x1 x2',
'x0 x2^2',
'x1^3',
'x1^2 x2',
'x1 x2^2',
'x2^3',
'x0^4',
'x0^3 x1',
'x0^3 x2',
'x0^2 x1^2',
'x0^2 x1 x2',
'x0^2 x2^2',
'x0 x1^3',
'x0 x1^2 x2',
'x0 x1 x2^2',
'x0 x2^3',
'x1^4',
'x1^3 x2',
'x1^2 x2^2',
'x1 x2^3',
'x2^4',
'x0^5',
'x0^4 x1',
'x0^4 x2',
'x0^3 x1^2',
'x0^3 x1 x2',
'x0^3 x2^2',
'x0^2 x1^3',
'x0^2 x1^2 x2',
'x0^2 x1 x2^2',
'x0^2 x2^3',
'x0 x1^4',
'x0 x1^3 x2',
'x0 x1^2 x2^2',
'x0 x1 x2^3',
'x0 x2^4',
'x1^5',
'x1^4 x2',
'x1^3 x2^2',
'x1^2 x2^3',
'x1 x2^4',
'x2^5']
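(In newer scikit-learn versions this method has been renamed to get_feature_names_out().)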
test_poly = poly.transform(test_input)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_poly, train_target)
lr.score(train_poly,train_target)
0.9999999999991097
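A training R² this close to 1, from 55 features fit on only 42 samples, is a textbook sign of overfitting; evaluating lr.score(test_poly, test_target) on this split yields a large negative value, which is what motivates the regularization below.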
## Regularization
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_poly)
train_scaled = ss.transform(train_poly)
test_scaled = ss.transform(test_poly)
## Ridge
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(train_scaled, train_target)
ridge.score(train_scaled, train_target)
0.9896101671037343
print(ridge.score(test_scaled, test_target))
0.9790693977615397
## Finding a good ridge hyperparameter
train_score = []
test_score = []
alpha_list = [0.001,0.01,0.1,1,10,100]
for alpha in alpha_list:
    ridge = Ridge(alpha=alpha)
    ridge.fit(train_scaled, train_target)
    train_score.append(ridge.score(train_scaled, train_target))
    test_score.append(ridge.score(test_scaled, test_target))
plt.plot(np.log10(alpha_list), train_score)
plt.plot(np.log10(alpha_list), test_score)
plt.show()
Training on too many terms causes overfitting. In the graph above we applied regularization with the ridge model;
that is, the coefficients are constrained so the model cannot overfit. A good alpha is the one where the two curves sit closest together and the test score peaks.
## Lasso regularization
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.1)
lasso.fit(train_scaled, train_target)
print(lasso.score(train_scaled, train_target))
0.990137631128448
/Users/junhyeoungson/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_coordinate_descent.py:531: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 806.2370926333242, tolerance: 518.2793833333334
positive)
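The ConvergenceWarning means the coordinate-descent solver hit its iteration limit. A common remedy, using scikit-learn's standard max_iter parameter, is to give it more iterations:

# More iterations let the coordinate-descent solver converge cleanly.
lasso = Lasso(alpha=0.1, max_iter=10000)
lasso.fit(train_scaled, train_target)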
## Finding a good lasso hyperparameter
train_score = []
test_score = []
alpha_list = [0.001,0.01,0.1,1,10,100]
for alpha in alpha_list:
    lasso = Lasso(alpha=alpha, max_iter=10000)  # Lasso, not Ridge, for the lasso search
    lasso.fit(train_scaled, train_target)
    train_score.append(lasso.score(train_scaled, train_target))
    test_score.append(lasso.score(test_scaled, test_target))
plt.plot(np.log10(alpha_list), train_score)
plt.plot(np.log10(alpha_list), test_score)
plt.show()
## Picking out the useful features with lasso
print(np.sum(lasso.coef_ == 0))
Unlike ridge, lasso drives many coefficients exactly to zero, so this count tells how many of the 55 polynomial features the model discards; the features that keep nonzero coefficients are the useful ones.
!jupyter nbconvert --to markdown "2021-03-14-회귀 알고리즘과 모델 규제.ipynb"