https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
import numpy as np
import pandas as pd
# dataset -> wine
from sklearn.datasets import load_wine
# model - logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
wine = load_wine()
print(wine['DESCR'])
✅Number of Instances: 178
✅Number of Attributes: 13 numeric, predictive attributes and the class
✅Attribute Information: (feature)
- Alcohol
- Malic acid
- Ash
- Alcalinity of ash
- Magnesium
- Total phenols
- Flavanoids
- Nonflavanoid phenols
- Proanthocyanins
- Color intensity
- Hue
- OD280/OD315 of diluted wines
- Proline
✅class: class_0 / class_1 / class_2
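Before modelling, it can help to look at the raw data as a table. A minimal sketch (the variable name wine_df is just illustrative; feature names come from wine['feature_names']):
# Optional: inspect the features and class balance as a DataFrame.
wine_df = pd.DataFrame(wine['data'], columns=wine['feature_names'])
wine_df['target'] = wine['target']
print(wine_df.head())
print(wine_df['target'].value_counts())   # class counts: 59 / 71 / 48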
X = wine['data']
y = wine['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
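Since the three wine classes are not equally sized, train_test_split can also stratify the split so both subsets keep the same class proportions. An optional variant, not used below (variable names are illustrative):
# Optional variant: stratified split preserves the 59/71/48 class ratio.
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)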
from sklearn.neighbors import KNeighborsClassifier
knn = Pipeline([('scaler', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=5))])
knn.fit(X_train, y_train)
print('train', knn.score(X_train, y_train))
print('test', knn.score(X_test, y_test))
train 0.9774436090225563
test 0.9555555555555556
proba = knn.predict_proba(X_test)
print(proba[:5])
# result
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[1. 0. 0.]
[0. 1. 0.]
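KNN estimates probabilities as the fraction of the k nearest neighbours that belong to each class, so with n_neighbors=5 the values can only be multiples of 0.2; the rows above are simply unanimous 5-out-of-5 votes. Raising k gives finer-grained values. An illustrative sketch (knn15 is just a throwaway name, not part of the analysis):
# Illustrative only: with more neighbours the vote fractions become finer-grained.
knn15 = Pipeline([('scaler', StandardScaler()),
                  ('knn', KNeighborsClassifier(n_neighbors=15))])
knn15.fit(X_train, y_train)
print(np.round(knn15.predict_proba(X_test[:5]), 3))   # multiples of 1/15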
select_index = (y_train == 0) | (y_train == 1)
X_train_select = X_train[select_index]
y_train_select = y_train[select_index]
lr_pipe = Pipeline([('scaler', StandardScaler()),
                    ('lr', LogisticRegression(C=20, max_iter=1000))])
lr_pipe.fit(X_train_select, y_train_select)
lr_pipe.score(X_train_select, y_train_select)
## A Pipeline stores its steps like a dict, so the fitted logistic regression is accessed with ['lr']
print(lr_pipe['lr'].coef_, lr_pipe['lr'].intercept_)
decisions = lr_pipe.decision_function(X_train_select[:10])
decisions
✅ decision_function returns the scores before the sigmoid (logistic) function is applied.
These scores are converted into probabilities with the sigmoid function.
from scipy.special import expit
print(np.round(expit(decisions), 3))
[0. 1. 1. 0. 1. 0. 0. 0.999 0.996 0. ]
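For reference, the decision value is just the linear combination w·x + b computed on the scaled features, and the sigmoid is 1 / (1 + e^(-z)). A hand-rolled check, assuming the step names ('scaler', 'lr') used in the pipeline above (X_scaled and z are illustrative names):
# Manual check: decision_function == X_scaled @ coef_.T + intercept_,
# and the sigmoid maps those scores to P(class 1).
X_scaled = lr_pipe['scaler'].transform(X_train_select[:10])
z = X_scaled @ lr_pipe['lr'].coef_.T + lr_pipe['lr'].intercept_    # shape (10, 1)
print(np.allclose(z.ravel(), decisions))                           # True
print(np.round(1 / (1 + np.exp(-z.ravel())), 3))                   # same as expit(decisions)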
np.round(lr_pipe.predict_proba(X_train_select[:10]), 3)
[1. , 0. ],
[0. , 1. ],
[0. , 1. ],
[1. , 0. ],
[0. , 1. ],
[1. , 0. ],
[1. , 0. ],
[0.001, 0.999],
[0.004, 0.996],
[1. , 0. ]
✅ Probability that each sample belongs to class 0 or class 1
lr_pipe.predict(X_train_select[0:10])
[0, 1, 1, 0, 1, 0, 0, 1, 1, 0]
✅ Predicted class labels
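In the binary case, predict is equivalent to thresholding the class-1 probability at 0.5, or the decision value at 0. A quick check (proba_bin is an illustrative name):
# predict() == thresholding P(class 1) at 0.5, or the decision value at 0.
proba_bin = lr_pipe.predict_proba(X_train_select[:10])
print((proba_bin[:, 1] >= 0.5).astype(int))    # [0 1 1 0 1 0 0 1 1 0]
print((decisions > 0).astype(int))             # same labels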
lr_pipe.fit(X_train, y_train)
print('train', lr_pipe.score(X_train, y_train))
print('test', lr_pipe.score(X_test, y_test))
train 1.0
test 0.9777777777777777
decision = lr_pipe.decision_function(X_test[:5])
print(np.round(decision, 2))
[ 7.6 -4.46 -3.14]
[ 7.58 -7. -0.57]
[-2.26 -2.6 4.85]
[ 6.82 -2.52 -4.3 ]
[-2.55 8.22 -5.67]
✅ Scores before the probability transformation is applied.
For this multiclass model the scores are converted into probabilities with the softmax function (see below).
proba1 = lr_pipe.predict_proba(X_test[0:5])   # same samples as the decision values above
print(np.round(proba1, 3))
[1. 0. 0. ]
[0. 1. 0. ]
[0. 1. 0. ]
[0. 0.004 0.996]
[1. 0. 0. ]
✅ The softmax function is used for multinomial (multi-class) logistic regression.
It converts the per-class decision values into a probability distribution over all classes,
i.e. the probability that each sample is assigned to each class.
from scipy.special import softmax
proba = softmax(decision, axis=1)
print(np.round(proba, decimals=3))
[1. 0. 0. ]
[0. 1. 0. ]
[0. 1. 0. ]
[0. 0.004 0.996]
[1. 0. 0. ]
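The softmax is softmax(z_k) = exp(z_k) / Σ_j exp(z_j); writing it out by hand gives the same matrix. A small sketch (exp_z and manual are illustrative names):
# Hand-rolled softmax over the three per-class decision values.
exp_z = np.exp(decision)                            # shape (5, 3)
manual = exp_z / exp_z.sum(axis=1, keepdims=True)
print(np.round(manual, 3))                          # matches predict_proba / scipy softmax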
print(lr_pipe['lr'].coef_, lr_pipe['lr'].intercept_)
# -> coefficient matrix (3 classes × 13 features) and intercepts (one per class)
✅ Inspect the coefficients, sorted from largest to smallest
import pandas as pd
pd.Series(lr_pipe['lr'].coef_[0], wine['feature_names']).sort_values(ascending=False)   # coefficients for class 0, indexed by feature name
# result
alcalinity_of_ash 2.617976
hue 0.735303
proanthocyanins 0.236070
flavanoids 0.056020
nonflavanoid_phenols -0.191370
magnesium -0.285169
total_phenols -0.613547
malic_acid -0.616823
od280/od315_of_diluted_wines -1.010078
color_intensity -1.405165
ash -2.173627
alcohol -3.055948
proline -3.127452
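Since a large negative coefficient pushes a sample away from class 0 just as strongly as a large positive one pulls it in, sorting by absolute value is often more informative for judging influence. An optional variant (coef0 is an illustrative name):
# Optional: rank class-0 coefficients by magnitude rather than signed value.
coef0 = pd.Series(lr_pipe['lr'].coef_[0], index=wine['feature_names'])
print(coef0.reindex(coef0.abs().sort_values(ascending=False).index))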
from sklearn.model_selection import GridSearchCV
lr_pipe3 = Pipeline([('scaler', StandardScaler()),
('lr', LogisticRegression(max_iter= 1000))])
params={'lr__C': [0.001, 0.01, 0.1, 1, 10, 100]}
gs = GridSearchCV(lr_pipe3,
                  param_grid=params,
                  cv=5,        # number of cross-validation folds (the default is 5 in recent scikit-learn)
                  n_jobs=-1)
gs.fit(X_train, y_train)
print('best params', gs.best_params_)
print('best score', gs.best_score_)
print('estimator', gs.best_estimator_)
best params {'lr__C': 10}
best score 0.9849002849002849
estimator Pipeline(steps=[('scaler', StandardScaler()), ('lr', LogisticRegression(C=10, max_iter=1000))])
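GridSearchCV also stores the per-candidate results in cv_results_, which is convenient to view as a DataFrame. A minimal sketch (cv_df is an illustrative name):
# Inspect mean/std CV accuracy for every value of C that was tried.
cv_df = pd.DataFrame(gs.cv_results_)
print(cv_df[['param_lr__C', 'mean_test_score', 'std_test_score', 'rank_test_score']])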
lr_final = gs.best_estimator_
lr_final.fit(X_train, y_train)   # best_estimator_ is already refit on X_train (refit=True by default), so this call is redundant but harmless
print('train set', lr_final.score(X_train, y_train))
print('test set', lr_final.score(X_test, y_test))
train set 1.0
test set 0.9777777777777777
✅ GridSearchCV's best score differs from the final model's scores because best_score_ is the mean accuracy over the 5 cross-validation folds of the training data, while the scores above are computed on the full (refit) training set and the held-out test set.
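That mean CV score can be reproduced directly, assuming the same default 5-fold splitting GridSearchCV used (cv_scores is an illustrative name):
# best_score_ is a cross-validation average on the training data, not a test-set score.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(lr_final, X_train, y_train, cv=5)
print(cv_scores, cv_scores.mean())   # mean ~0.985, matching gs.best_score_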
decision3 = lr_final.decision_function(X_test[:5])
print(np.round(decision3, 2))
[ 6.82 -3.82 -3. ]
[ 6.77 -6.02 -0.75]
[-2.14 -2.14 4.28]
[ 6.16 -2.08 -4.08]
[-2.28 7.26 -4.98]
proba = lr_final.predict_proba(X_test[0:5])
print(np.round(proba, 3))
[1. 0. 0. ]
[0.999 0. 0.001]
[0.002 0.002 0.997]
[1. 0. 0. ]
[0. 1. 0. ]
from scipy.special import softmax
proba1 = softmax(decision3, axis=1)
print(np.round(proba1, decimals=3))
[1. 0. 0. ]
[0.999 0. 0.001]
[0.002 0.002 0.997]
[1. 0. 0. ]
[0. 1. 0. ]
print(np.round(lr_final.predict(X_test[0:5]),3))
-> [0 0 2 0 1]
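To see where the remaining test-set errors fall, a confusion matrix and classification report on the full test set can help. An optional addition (y_pred is an illustrative name):
# Optional: per-class breakdown of the test-set performance.
from sklearn.metrics import classification_report, confusion_matrix
y_pred = lr_final.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=wine['target_names']))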