다양한 분류 알고리즘

ManSonCoding·2021년 4월 1일
0

본 내용은 혼자 공부하는 머신러닝 코드에 출처가 있음을 밝힙니다.

로지스틱 회귀

# Load the fish dataset (one species label + 5 numeric measurements per row).
import pandas as pd
fish = pd.read_csv('https://bit.ly/fish_csv_data')
# Peek at the first rows: Species, Weight, Length, Diagonal, Height, Width.
fish.head()
Species Weight Length Diagonal Height Width
0 Bream 242.0 25.4 30.0 11.5200 4.0200
1 Bream 290.0 26.3 31.2 12.4800 4.3056
2 Bream 340.0 26.5 31.1 12.3778 4.6961
3 Bream 363.0 29.0 33.5 12.7300 4.4555
4 Bream 430.0 29.0 34.0 12.4440 5.1340
# List every distinct species label, in order of first appearance.
print(fish['Species'].unique())
['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']
# Separate the frame into a numeric feature matrix and a label vector.
feature_cols = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
fish_input = fish[feature_cols].to_numpy()
fish_target = fish['Species'].to_numpy()
fish_input.shape
(159, 5)
# Split into train/test sets, then standardise the features.
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, random_state=42)
# Fit the scaling statistics on the training set only, then apply the
# same transform to both splits (avoids leaking test-set statistics).
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)

특성(feature)마다 값의 스케일이 다르기 때문에 StandardScaler로 표준화 변환을 해준다. 이는 KNN처럼 샘플 간 거리를 기준으로 삼는 머신러닝 알고리즘에서 특히 유용하다.

# Classify species with k-nearest-neighbours, k = 3.
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=3).fit(train_scaled, train_target)
# Accuracy on the train and test splits, then the sorted class labels.
print(kn.score(train_scaled, train_target))
print(kn.score(test_scaled, test_target))
print(kn.classes_)
0.8907563025210085
0.85
['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
# Predicted species for the first ten test samples.
print(kn.predict(test_scaled[:10]))
['Perch' 'Smelt' 'Pike' 'Perch' 'Perch' 'Bream' 'Smelt' 'Roach' 'Perch'
 'Pike']
# How were those predictions made? Inspect the class probabilities:
# for KNN, predict_proba is the fraction of the k neighbours in each class.
import numpy as np  # fix: np was used here before numpy was imported (only at the sigmoid cell below)
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba, decimals=4))  # show 4 decimal places
[[0.     0.     1.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]
 [0.     0.     0.     1.     0.     0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]]
# Which 3 training samples are the neighbours of the 4th test sample?
# (distances are returned too, but only the indexes are used here)
distance, indexes = kn.kneighbors(test_scaled[3:4])
print(train_target[indexes])
[['Roach' 'Perch' 'Perch']]
## Plot the sigmoid (logistic) function phi(z) = 1 / (1 + exp(-z)).
import numpy as np
import matplotlib.pyplot as plt
# fix: the 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6,
# so the old name crashes on current versions; pick whichever is available.
_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use([_style])
z = np.arange(-5, 5, 0.1)
phi = 1 / (1 + np.exp(-z))
plt.plot(z, phi)
plt.xlabel('Z')
plt.ylabel('PHI')
plt.show()
# NOTE(review): the book states scikit-learn assigns a probability of exactly
# 0.5 to the negative class -- confirm against the scikit-learn docs.

png

## 로지스틱 회귀로 이진분류 수행하기
# Demo of NumPy boolean-mask indexing: only positions where the
# mask is True survive in the result.
char_arr = np.array(['A', 'B', 'C', 'D', 'E'])
mask = [True, False, True, False, False]
print(char_arr[mask])
['A' 'C']
# Build a binary problem by keeping only the Bream and Smelt rows.
bream_smelt_indexes = np.isin(train_target, ['Bream', 'Smelt'])
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]
# Fit a binary logistic regression and show its training-set predictions.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_bream_smelt, target_bream_smelt)
lr.predict(train_bream_smelt)
array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream', 'Smelt', 'Bream',
       'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Bream', 'Smelt',
       'Bream', 'Smelt', 'Smelt', 'Bream', 'Bream', 'Bream', 'Bream',
       'Bream', 'Bream', 'Bream', 'Bream', 'Smelt', 'Bream', 'Smelt',
       'Smelt', 'Bream', 'Smelt', 'Bream', 'Bream'], dtype=object)
# Class probabilities for the first five binary-training samples;
# columns follow lr.classes_ order (['Bream' 'Smelt'] per the output below).
lr.predict_proba(train_bream_smelt[:5])
array([[0.99759855, 0.00240145],
       [0.02735183, 0.97264817],
       [0.99486072, 0.00513928],
       [0.98584202, 0.01415798],
       [0.99767269, 0.00232731]])
# Class order for the probability columns, then the learned weights and bias.
print(lr.classes_)
print(lr.coef_, lr.intercept_)
['Bream' 'Smelt']
[[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]
# Multiclass logistic regression over all 7 species.
# C=20 weakens the L2 regularisation (C is the inverse strength);
# max_iter is raised to 1000 so the solver can converge.
lr = LogisticRegression(C=20, max_iter = 1000)
lr.fit(train_scaled, train_target)
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled,test_target))
0.9327731092436975
0.925
# Per-class probabilities for the first 5 test samples,
# with the class order of the columns printed underneath.
proba = lr.predict_proba(test_scaled[:5])
print(np.round(proba, decimals =3))
print(lr.classes_)
[[0.    0.014 0.841 0.    0.136 0.007 0.003]
 [0.    0.003 0.044 0.    0.007 0.946 0.   ]
 [0.    0.    0.034 0.935 0.015 0.016 0.   ]
 [0.011 0.034 0.306 0.007 0.567 0.    0.076]
 [0.    0.    0.904 0.002 0.089 0.002 0.001]]
['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']

손실함수

fish
Species Weight Length Diagonal Height Width
0 Bream 242.0 25.4 30.0 11.5200 4.0200
1 Bream 290.0 26.3 31.2 12.4800 4.3056
2 Bream 340.0 26.5 31.1 12.3778 4.6961
3 Bream 363.0 29.0 33.5 12.7300 4.4555
4 Bream 430.0 29.0 34.0 12.4440 5.1340
... ... ... ... ... ... ...
154 Smelt 12.2 12.2 13.4 2.0904 1.3936
155 Smelt 13.4 12.4 13.5 2.4300 1.2690
156 Smelt 12.2 13.0 13.8 2.2770 1.2558
157 Smelt 19.7 14.3 15.2 2.8728 2.0672
158 Smelt 19.9 15.0 16.2 2.9322 1.8792

159 rows × 6 columns

# Rebuild the train/test split and the standardised features
# (same recipe as before: statistics fitted on the training set only).
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, random_state=42)
ss = StandardScaler()
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)
# Stochastic gradient descent classifier with logistic loss.
# NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1 and the
# old name was removed in 1.3 -- change it when running on a newer version.
# max_iter=10 is too few epochs to converge (see the ConvergenceWarning below).
from sklearn.linear_model import SGDClassifier
sc = SGDClassifier(loss = 'log', max_iter = 10, random_state = 42)
sc.fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))
0.773109243697479
0.775


/Users/junhyeoungson/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_stochastic_gradient.py:573: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  ConvergenceWarning)
# Continue training: partial_fit runs one more pass over the data
# without resetting the weights already learned by fit().
sc.partial_fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled,test_target))
0.8319327731092437
0.8
# Accuracy curves: train one epoch at a time with partial_fit and
# record train/test accuracy after every epoch.
sc = SGDClassifier(loss= 'log', random_state = 42)
train_score, test_score = [], []
# partial_fit must be told the full label set up front, because any
# single call may not contain every class.
classes = np.unique(train_target)
for _ in range(300):
    sc.partial_fit(train_scaled, train_target, classes=classes)
    train_score.append(sc.score(train_scaled, train_target))
    test_score.append(sc.score(test_scaled, test_target))
plt.plot(train_score)
plt.plot(test_score)
[<matplotlib.lines.Line2D at 0x7fcb9bfd4090>]

png

# tol=None disables early stopping, so training runs all 100 epochs.
sc = SGDClassifier(loss= 'log', random_state = 42,tol = None, max_iter = 100) # tol = minimum improvement threshold for early stopping
sc.fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))

0.957983193277311
0.925
# Same experiment with the hinge loss (the linear-SVM objective).
sc = SGDClassifier(loss= 'hinge', random_state = 42,tol = None, max_iter = 100) # tol = minimum improvement threshold for early stopping
sc.fit(train_scaled, train_target)
print(sc.score(train_scaled, train_target))
print(sc.score(test_scaled, test_target))


0.9495798319327731
0.925
profile
AlwaysILearned

0개의 댓글