
# Plot the logistic (sigmoid) function over [-10, 10].
import matplotlib.pyplot as plt
import numpy as np


def logistic_func(X):
    """Element-wise logistic sigmoid 1 / (1 + e^-X); maps any real value into (0, 1)."""
    return 1 / (1 + np.exp(-X))


X = np.linspace(-10, 10, 1000)
y = logistic_func(X)

plt.figure(figsize=(13, 6))
plt.plot(X, y, color='b', linewidth=2)
# axhline draws a horizontal line at the given y position;
# axvline(x=...) would draw a vertical one.
plt.axhline(y=0.5, color='r', linestyle=':')
plt.ylim(-0.15, 1.15)  # y-axis display range
plt.yticks(np.arange(-0.1, 1.2, 0.1))
ax = plt.gca()
ax.spines['left'].set_position("center")       # move spine to a preset position
ax.spines['bottom'].set_position(('data', 0))  # move spine to a specific data coordinate
ax.spines['top'].set_position(("data", 1))
ax.spines['right'].set_visible(False)          # hide this spine
plt.show()
# Confirm the sigmoid values stay strictly inside (0, 1) on this grid.
print(np.min(y), np.max(y))
# -> (4.5397868702434395e-05, 0.9999546021312976)
"양성" if logistic_func(100) > 0.5 else "음성"# pos'양성'

- Loss Function
- 모델이 예측한 값과 정답간의 차이(오차, loss)를 구하는 함수.
- 모델의 파라미터를 최적화할 때 loss를 최소화하는 것을 목적으로 한다.
loss = -np.log(모델이 예측한 정답 클래스에 대한 확률)
정답: 1(pos), pos확률: 0.7, neg 확률: 0.3 => loss = -log(0.7)
정답: 0(neg), pos확률: 0.7, neg 확률: 0.3 => loss = -log(0.3)
# Visualize log loss as a function of the probability the model
# assigned to the true class: 0 when p = 1, unbounded as p -> 0.
import numpy as np
import matplotlib.pyplot as plt

X = np.linspace(0.000000001, 1, 100)  # probability of the true class (x values)
y = -np.log(X)                        # error (log loss)

plt.figure(figsize=(10, 8))
plt.plot(X, y)
plt.axvline(0.5, linestyle=':', linewidth=2, color='r')
plt.xticks(np.arange(0, 1.1, 0.1))
plt.yticks([0, 1, 2, 3, 4, 5, 10, 20])
plt.gca().spines['bottom'].set_position(("data", 0))  # anchor the x-axis at y = 0
plt.show()
# Load the breast-cancer binary-classification dataset and hold out 20% for testing.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
# stratify=y keeps the pos/neg class ratio identical across both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=0
)
# Standardize the features, then fit a logistic-regression classifier.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(random_state=0)),
])
pipeline.fit(X_train, y_train)

# Evaluate: hard predictions plus positive-class probabilities for both splits.
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)
pred_train_proba = pipeline.predict_proba(X_train)
pred_test_proba = pipeline.predict_proba(X_test)

from metrics import print_binary_classification_metrics  # project-local helper

print_binary_classification_metrics(y_train, pred_train, pred_train_proba[:, 1])
print_binary_classification_metrics(y_test, pred_test, pred_test_proba[:, 1])
# Output (train set):
#   정확도: 0.989010989010989
#   재현율: 0.9929824561403509
#   정밀도: 0.9895104895104895
#   F1 점수: 0.9912434325744308
#   Average Precision: 0.9985893579760078
#   ROC-AUC Score: 0.9979153766769865
# Output (test set):
#   정확도: 0.9824561403508771
#   재현율: 1.0
#   정밀도: 0.972972972972973
#   F1 점수: 0.9863013698630136
#   Average Precision: 0.9974301219609739
#   ROC-AUC Score: 0.9957010582010581
# Tune the inverse regularization strength C with a 4-fold grid search.
from sklearn.model_selection import GridSearchCV

params = {
    "model__C": [0.01, 0.1, 1, 10],  # smaller C = stronger regularization
}
gs = GridSearchCV(
    pipeline,
    params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,  # use all CPU cores
)
gs.fit(X_train, y_train)

print(gs.best_score_)   # -> 0.9736259897531439
print(gs.best_params_)  # -> {'model__C': 1}