본 내용은 『혼자 공부하는 머신러닝+딥러닝』의 예제 코드를 바탕으로 작성되었음을 밝힙니다.
# Load the fish dataset from the book's hosted CSV and preview the first rows.
import pandas as pd
fish = pd.read_csv('https://bit.ly/fish_csv_data')
fish.head()
Species | Weight | Length | Diagonal | Height | Width | |
---|---|---|---|---|---|---|
0 | Bream | 242.0 | 25.4 | 30.0 | 11.5200 | 4.0200 |
1 | Bream | 290.0 | 26.3 | 31.2 | 12.4800 | 4.3056 |
2 | Bream | 340.0 | 26.5 | 31.1 | 12.3778 | 4.6961 |
3 | Bream | 363.0 | 29.0 | 33.5 | 12.7300 | 4.4555 |
4 | Bream | 430.0 | 29.0 | 34.0 | 12.4440 | 5.1340 |
# List the unique species labels present in the target column (7 classes).
print(pd.unique(fish['Species']))
['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']
# Split the frame into a numeric feature matrix and the species target labels.
fish_input = fish[['Weight','Length','Diagonal','Height','Width']].to_numpy()
fish_target = fish['Species'].to_numpy()
fish_input.shape
(159, 5)
# Train/test split (default 75/25); fixed random_state for reproducibility.
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(fish_input, fish_target, random_state = 42)
# Standardize the features: learn the scaling statistics on the training set
# only, then apply that same transform to both splits (avoids test-set leakage).
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# fit_transform replaces the separate fit()/transform() pair on the train set.
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)
데이터의 크기가 feature에 따라 다르기 때문에 standardscaler로 크기 변환을 해준다. 이는 거리를 재는 머신러닝에서 특히 유용하게 쓰인다.
# Classify the species with k-nearest neighbors (k=3) on the scaled features.
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, train_target)
print(kn.score(train_scaled, train_target))
print(kn.score(test_scaled, test_target))
# classes_ holds the label set in sorted (alphabetical) order.
print(kn.classes_)
0.8907563025210085
0.85
['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
# Predicted class labels for the first 10 test samples.
print(kn.predict(test_scaled[:10]))
['Perch' 'Smelt' 'Pike' 'Perch' 'Perch' 'Bream' 'Smelt' 'Roach' 'Perch'
'Pike']
# How were those predictions made? Inspect the per-class probabilities.
# np is first used here but numpy is only imported further down the file,
# so import it now to keep this section runnable on its own.
import numpy as np
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba, decimals = 4)) # round to 4 decimal places for display
[[0. 0. 1. 0. 0. 0. 0. ]
[0. 0. 0. 0. 0. 1. 0. ]
[0. 0. 0. 1. 0. 0. 0. ]
[0. 0. 0.6667 0. 0.3333 0. 0. ]
[0. 0. 0.6667 0. 0.3333 0. 0. ]]
# Look up the 3 nearest training neighbors of test sample #3 — their labels
# explain the 0.6667 / 0.3333 probability split printed above.
distance, indexes = kn.kneighbors(test_scaled[3:4])
print(train_target[indexes])
[['Roach' 'Perch' 'Perch']]
## Plot the sigmoid (logistic) function
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): the 'seaborn' style name was removed in matplotlib >= 3.6;
# newer versions need e.g. 'seaborn-v0_8' — confirm the installed version.
plt.style.use(['seaborn'])
z = np.arange(-5, 5, 0.1)
phi = 1 / (1 + np.exp(-z))  # sigmoid: maps z in (-inf, inf) to (0, 1)
plt.plot(z, phi)
plt.xlabel('Z')
plt.ylabel('PHI')
plt.show()
# scikit-learn treats a probability of exactly 0.5 as the negative class.
## Binary classification with logistic regression
# Boolean-mask indexing demo: keep only the positions marked True.
char_arr = np.array(['A','B','C','D','E'])
mask = [True, False, True, False, False]
print(char_arr[mask])
['A' 'C']
# Build a binary problem by keeping only the Bream and Smelt training rows,
# then fit a logistic regression on it.
bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt')
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_bream_smelt, target_bream_smelt)
lr.predict(train_bream_smelt)
array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream', 'Smelt', 'Bream',
'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Smelt',
'Bream', 'Smelt', 'Smelt', 'Bream', 'Bream', 'Bream', 'Bream',
'Bream', 'Bream', 'Bream', 'Bream', 'Smelt', 'Bream', 'Smelt',
'Smelt', 'Bream', 'Smelt', 'Bream', 'Bream'], dtype=object)
lr.predict_proba(train_bream_smelt[:5])
array([[0.99759855, 0.00240145],
[0.02735183, 0.97264817],
[0.99486072, 0.00513928],
[0.98584202, 0.01415798],
[0.99767269, 0.00232731]])
# classes_ is alphabetical: ['Bream' 'Smelt'], so Smelt is the positive class
# (second predict_proba column).
print(lr.classes_)
# Learned weights (one per feature) and the intercept of z = w.x + b.
print(lr.coef_, lr.intercept_)
['Bream' 'Smelt']
[[-0.4037798 -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]
# Multiclass logistic regression over all 7 species.
# C=20 weakens the L2 regularization (C is its inverse strength; default 1);
# max_iter raised to 1000 so the solver converges.
lr = LogisticRegression(C=20, max_iter = 1000)
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled,test_target))
0.9327731092436975
0.925
# Per-class probabilities for the first 5 test samples;
# columns follow the order of lr.classes_ printed below.
proba = lr.predict_proba(test_scaled[:5])
print(np.round(proba, decimals =3))
print(lr.classes_)
[[0. 0.014 0.841 0. 0.136 0.007 0.003]
[0. 0.003 0.044 0. 0.007 0.946 0. ]
[0. 0. 0.034 0.935 0.015 0.016 0. ]
[0.011 0.034 0.306 0.007 0.567 0. 0.076]
[0. 0. 0.904 0.002 0.089 0.002 0.001]]
['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
# Display the full DataFrame (159 rows x 6 columns).
fish
Species | Weight | Length | Diagonal | Height | Width | |
---|---|---|---|---|---|---|
0 | Bream | 242.0 | 25.4 | 30.0 | 11.5200 | 4.0200 |
1 | Bream | 290.0 | 26.3 | 31.2 | 12.4800 | 4.3056 |
2 | Bream | 340.0 | 26.5 | 31.1 | 12.3778 | 4.6961 |
3 | Bream | 363.0 | 29.0 | 33.5 | 12.7300 | 4.4555 |
4 | Bream | 430.0 | 29.0 | 34.0 | 12.4440 | 5.1340 |
... | ... | ... | ... | ... | ... | ... |
154 | Smelt | 12.2 | 12.2 | 13.4 | 2.0904 | 1.3936 |
155 | Smelt | 13.4 | 12.4 | 13.5 | 2.4300 | 1.2690 |
156 | Smelt | 12.2 | 13.0 | 13.8 | 2.2770 | 1.2558 |
157 | Smelt | 19.7 | 14.3 | 15.2 | 2.8728 | 2.0672 |
158 | Smelt | 19.9 | 15.0 | 16.2 | 2.9322 | 1.8792 |
159 rows × 6 columns
# Re-create the split and the standardized features
# (repeats the earlier preprocessing so this section can run on its own).
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(fish_input, fish_target, random_state = 42)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)
# Stochastic gradient descent classifier with logistic loss.
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1 and
# removed in 1.3 — confirm against the installed version.
# max_iter=10 is deliberately tiny; sklearn emits the ConvergenceWarning below.
from sklearn.linear_model import SGDClassifier
sc = SGDClassifier(loss = 'log', max_iter = 10, random_state = 42)
sc.fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))
0.773109243697479
0.775
/Users/junhyeoungson/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_stochastic_gradient.py:573: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
ConvergenceWarning)
# Continue training incrementally: partial_fit runs one more pass over the data
# without resetting the model, and both scores improve.
sc.partial_fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled,test_target))
0.8319327731092437
0.8
# Accuracy-vs-epoch curves: train one epoch at a time with partial_fit and
# record train/test accuracy to choose a good number of epochs.
# (The loop body lost its indentation in the notebook export; restored here.)
sc = SGDClassifier(loss= 'log', random_state = 42)
train_score = []
test_score = []
# partial_fit must be told the full label set up front, since each call
# only ever sees one pass over the data.
classes = np.unique(train_target)
for _ in range(0, 300):
    sc.partial_fit(train_scaled, train_target, classes = classes)
    train_score.append(sc.score(train_scaled, train_target))
    test_score.append(sc.score(test_scaled, test_target))
plt.plot(train_score)
plt.plot(test_score)
[<matplotlib.lines.Line2D at 0x7fcb9bfd4090>]
# Retrain with tol=None so SGD always runs the full 100 epochs (no early stop).
sc = SGDClassifier(loss= 'log', random_state = 42,tol = None, max_iter = 100) # tol: minimum improvement required to keep training
sc.fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))
0.957983193277311
0.925
# Same setup with hinge loss (linear SVM objective) for comparison.
sc = SGDClassifier(loss= 'hinge', random_state = 42,tol = None, max_iter = 100) # tol: minimum improvement required to keep training
sc.fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))
0.9495798319327731
0.925