예시
from sklearn.preprocessing import Binarizer
X = [
    [1, 0, -1],
    [3, 0, 2],
    [0, 1.3, 1.5],
]
# Binarize against a threshold of 1: values strictly greater than 1 become 1,
# everything else (including exactly 1) becomes 0.
binarizer = Binarizer(threshold=1)
print(binarizer.fit_transform(X))
[[0. 0. 0.]
[1. 0. 1.]
[0. 1. 1.]]
라이브러리 및 데이터 불러오기
import pandas as pd
import seaborn as sns
import numpy as np
# Load the 'diamonds' example dataset through seaborn.
df = sns.load_dataset('diamonds')
# Preview the first rows; the result is only rendered when this runs as a
# notebook cell and is discarded when run as a plain script.
df.head()
이진 분류 문제로 변환
# Binary target: 1 when the cut is 'Premium' or 'Ideal', otherwise 0.
# Vectorized membership test instead of a per-row Python lambda; rows whose
# cut is missing are not members of the list and therefore map to 0.
df['cut_binary'] = df['cut'].isin(['Premium', 'Ideal']).astype('int64')
수치형 변수만 사용 (문제를 단순화하기 위함)
# Restrict the frame to the selected numeric predictors plus the binary target
# to keep the problem simple.
feature_columns = ['depth', 'table', 'price', 'x', 'y', 'z']
df_sp = df[feature_columns + ['cut_binary']]
train/test 분리
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
features = df_sp.drop(columns='cut_binary')
target = df_sp['cut_binary']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=111,
)
평가 지표 함수 생성
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
def get_clf_eval(y_test, pred, average='binary'):
    """Print the confusion matrix, accuracy, precision and recall of a classifier.

    Args:
        y_test: True labels.
        pred: Predicted labels, aligned with ``y_test``.
        average: Averaging mode forwarded to ``precision_score`` and
            ``recall_score``. Defaults to ``'binary'`` so both scores describe
            the positive class. With ``'micro'`` on a single-label problem
            precision and recall both collapse to the accuracy value, which
            hides the precision/recall trade-off this report is meant to show.

    Returns:
        None. The report is written to standard output.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average=average)
    recall = recall_score(y_test, pred, average=average)
    print('오차 행렬')
    print(confusion)
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}')
모델 학습 (LogisticRegression)
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression (solver capped at 500 iterations) on the training
# split; fit() returns the estimator itself, so construction and fitting chain.
lr_clf = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Score the default predictions on the held-out split.
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)
오차 행렬
[[2861 2719]
[1012 9590]]
정확도: 0.7694, 정밀도: 0.7694, 재현율: 0.7694
pred_proba 확인
# Per-class predicted probabilities for the test split. Column 1 is later used
# as the positive class (cut_binary == 1).
# NOTE(review): this relies on lr_clf.classes_ being ordered [0, 1] — confirm.
pred_proba = lr_clf.predict_proba(X_test)
predict_proba()
: 머신러닝 모델, 특히 분류 모델에서 각 샘플이 각 클래스에 속할 확률을 예측하는 메서드
(핵심) 임계값 조정을 위해 Binarizer 선언 및 임계값 조정에 따른 결과
from sklearn.preprocessing import Binarizer
# Re-label the test set with a custom decision threshold of 0.3: binarize the
# probability matrix, then keep the positive-class column as a column vector.
binarizer = Binarizer(threshold=0.3)
binarized_proba = binarizer.fit_transform(pred_proba)
tt_pred = binarized_proba[:, 1].reshape(-1, 1)
get_clf_eval(y_test, tt_pred)
오차 행렬
[[ 1311 4269]
[ 128 10474]]
정확도: 0.7283, 정밀도: 0.7283, 재현율: 0.7283
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
# Candidate decision thresholds to sweep when evaluating the classifier.
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
def get_eval_list(y_test, pred_proba, thresholds):
    """Score the positive-class predictions at each decision threshold.

    Args:
        y_test: True binary labels.
        pred_proba: Probability matrix whose column 1 holds the positive-class
            probability.
        thresholds: Iterable of cut-off values to evaluate, in order.

    Returns:
        A 4-tuple of lists ``(accuracy, precision, recall, f1)``; each list
        holds one score per threshold, in the order the thresholds were given.
    """
    # Scorers are applied in this fixed order for every threshold.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    score_lists = [[] for _ in scorers]

    for cutoff in thresholds:
        binarized = Binarizer(threshold=cutoff).fit_transform(pred_proba)
        labels = binarized[:, 1].reshape(-1, 1)
        for scores, scorer in zip(score_lists, scorers):
            scores.append(scorer(y_test, labels))

    accuracy_tt_list, precision_tt_list, recall_tt_list, f1_tt_list = score_lists
    return accuracy_tt_list, precision_tt_list, recall_tt_list, f1_tt_list
def figure_eval(accuracy_tt_list, precision_tt_list, recall_tt_list, f1_tt_list, thresholds):
    """Plot each metric against the thresholds on a 2x2 grid of subplots.

    Args:
        accuracy_tt_list: Accuracy per threshold.
        precision_tt_list: Precision per threshold.
        recall_tt_list: Recall per threshold.
        f1_tt_list: F1 score per threshold.
        thresholds: The threshold values used as the shared x-axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    panels = (
        ('Accuracy', accuracy_tt_list),
        ('Precision', precision_tt_list),
        ('Recall', recall_tt_list),
        ('F1 Score', f1_tt_list),
    )
    _, axes = plt.subplots(2, 2, figsize=(12, 10))
    # One subplot per metric, filled in row-major order.
    for ax, (title, scores) in zip(axes.flat, panels):
        ax.plot(thresholds, scores, marker='o')
        ax.set_title(title)
        ax.set_xlabel('Threshold')
        ax.set_ylabel('Score')
    plt.show()
# Sweep the thresholds, then chart how each metric responds.
accuracy_list, precision_list, recall_list, f1_list = get_eval_list(
    y_test=y_test,
    pred_proba=pred_proba,
    thresholds=thresholds,
)
figure_eval(
    accuracy_tt_list=accuracy_list,
    precision_tt_list=precision_list,
    recall_tt_list=recall_list,
    f1_tt_list=f1_list,
    thresholds=thresholds,
)
종속변수
독립 변수 (거리 선택 방식)