# f1_score(): F1-score API provided by sklearn.
from sklearn.metrics import f1_score
f1 = f1_score(y_test, pred)
print('F1 스코어 : {0:.4f}'.format(f1))
# FPR = FP / (FP + TN) = 1 - TNR = 1 - specificity
# roc_curve() API: usage is similar to precision_recall_curve(),
# but the returned values are structured differently.
#   y_true:  actual class values, shape = [n_samples]
#   y_score: usually the positive-class column of predict_proba(), shape = [n_samples]
# Returns:
#   fpr:        FPR values as an array
#   tpr:        TPR values as an array
#   thresholds: threshold array
### Using the Titanic survivor prediction dataset
from sklearn.metrics import roc_curve
#레이블값이 1일 때의 예측 확률을 추출
pred_proba_class1 = lr_clf.predict_proba(x_test)[:, 1]
fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)
#반환된 임곗값 배열에서 샘플로 데이터 추출하되, 임곗값을 5 step으로 추출
#thresholds[0]은 max(예측확률)+1로 임의 설정됨. 이를 제외하기 위해 np.arange를 1로 시작
thr_index = np.arange(1, thresholds.shape[0], 5)
print('샘플 추출을 위한 임곗값 배열의 index : ', thr_index)
print('샘플 index로 추출한 임곗값 : ', np.round(thresholds[thr_index], 2))
#5step 단위로 추출된 임곗값에 따른 FPR, TPR 값
print('샘플 임곗값 별 FPR : ', np.round(fprs[thr_index], 3))
print('샘플 임곗값 별 TPR : ', np.round(tprs[thr_index], 3))
### ROC 곡선 시각화
import matplotlib.pyplot as plt
def roc_curve_plot(y_test, pred_proba_class1):
    """Plot the ROC curve for the given labels and positive-class probabilities.

    Args:
        y_test: actual class values.
        pred_proba_class1: predicted probabilities of the positive class.
    """
    # FPR/TPR pairs over all candidate thresholds.
    fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)
    plt.plot(fprs, tprs, label="ROC")
    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', label="Random")
    # Rescale the x axis to 0.1-unit ticks.
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))
    plt.xlim(0, 1); plt.ylim(0, 1)
    plt.xlabel('FPR(1-Sensitivity)'); plt.ylabel('TPR(Recall)')
    plt.legend()
    # Fix: render the figure, consistent with precision_recall_curve_plot below.
    plt.show()

roc_curve_plot(y_test, pred_proba_class1)
from sklearn.metrics import roc_auc_score
pred_proba = lr_clf.predict_proba(x_test)[:, 1]
roc_score = roc_auc_score(y_test, pred_proba)
print('ROC AUC 값 : {0:.4f}'.format(roc_score))
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix plus accuracy, precision, recall, F1 and ROC AUC.

    Args:
        y_test: actual class values.
        pred: hard class predictions.
        pred_proba: positive-class probabilities (used for ROC AUC).
    """
    print('오차행렬 : ')
    print(confusion_matrix(y_test, pred))
    # Gather the five scalar metrics in display order.
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
        roc_auc_score(y_test, pred_proba),
    )
    print('정확도 : {0:.4f} , 정밀도 : {1:.4f} , 재현율 : {2:.4f} , F1 : {3:.4f}, AUC : {4:.4f}'.format(*scores))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Binarizer
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Report classification metrics: confusion matrix, accuracy, precision,
    recall, F1 and ROC AUC.

    Args:
        y_test: actual class values.
        pred: hard class predictions.
        pred_proba: positive-class probabilities (used for ROC AUC).
    """
    cm = confusion_matrix(y_test, pred)
    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred_proba)
    print('오차행렬 : ')
    print(cm)
    print('정확도 : {0:.4f} , 정밀도 : {1:.4f} , 재현율 : {2:.4f} , F1 : {3:.4f}, AUC : {4:.4f}'.format(acc, prec, rec, f1, auc))
def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        y_test: actual class values.
        pred_proba_c1: predicted probabilities of the positive class.
    """
    # Threshold ndarray plus the precision/recall ndarrays per threshold.
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)
    # x axis: threshold; y axis: precision (dashed line) and recall.
    plt.figure(figsize=(8, 6))
    n_thresholds = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:n_thresholds], label='recall')
    # Rescale the x axis to 0.1-unit ticks.
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))
    # Axis labels, legend and grid.
    plt.xlabel("Threshold value")
    plt.ylabel("Precision and Recall Value")
    plt.legend()
    plt.grid()
    plt.show()
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Evaluate classification metrics at each candidate decision threshold.

    Args:
        y_test: actual class values.
        pred_proba_c1: positive-class probabilities, shaped (n_samples, 1).
        thresholds: iterable of candidate thresholds.
    """
    for custom_threshold in thresholds:
        # Binarize the probabilities against the current threshold
        # (Binarizer's fit is stateless, so fit_transform == fit + transform).
        custom_predict = Binarizer(threshold=custom_threshold).fit_transform(pred_proba_c1)
        print("임곗값 : ", custom_threshold)
        get_clf_eval(y_test, custom_predict, pred_proba_c1)
#데이터 로딩 및 outcome 개수 확인
diabets_data = pd.read_csv('../kaggle/pima_indians_diabets/diabetes.csv')
print(diabets_data['Outcome'].value_counts())
diabets_data.head(3)
# 피처 타입과 null 개수 확인
diabets_data.info()
x = diabets_data.iloc[:, :-1]
y = diabets_data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=156, stratify=y)
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)
pred = lr_clf.predict(x_test)
pred_proba = lr_clf.predict_proba(x_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)
# 전체 데이터의 65%가 Negative이므로 정확도보다 재현율에 초점을 맞춰 변경해 보자.
pred_proba_c1 = lr_clf.predict_proba(x_test)[:, 1]
precision_recall_curve_plot(y_test, pred_proba_c1)
diabets_data.describe()
# Features where a raw value of 0 is checked.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Total number of data rows.
total_count = diabets_data['Glucose'].count()
# For each feature, count rows whose value is 0 and report the percentage.
for feature in zero_features:
    zero_count = diabets_data[diabets_data[feature] == 0][feature].count()
    pct = 100 * zero_count / total_count
    print('{0} 0 건수는 {1}, 퍼센트는 {2:.2f}%'.format(feature, zero_count, pct))
# Replace the zeros in those features with each feature's mean value.
mean_zero_features = diabets_data[zero_features].mean()
diabets_data[zero_features] = diabets_data[zero_features].replace(0, mean_zero_features)
#변환값에 대해 피처 스케일링 적용해 변환
x = diabets_data.iloc[:, :-1]
y = diabets_data.iloc[:, -1]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=156, stratify=y)
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)
pred = lr_clf.predict(x_test)
pred_proba = lr_clf.predict_proba(x_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)
thresholds = [0.3, 0.33, 0.36, 0.39, 0.42, 0.45, 0.48, 0.50]
pred_proba = lr_clf.predict_proba(x_test)
get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1, 1), thresholds)
binarizer = Binarizer(threshold=0.48)
pred_th_048 = binarizer.fit_transform(pred_proba[:, 1].reshape(-1, 1))
get_clf_eval(y_test, pred_th_048, pred_proba[:, 1])