21.2.19 / scikit-learn / 강의 수강 및 복습, Kaggle 피마 인디언 당뇨병 예측

pjk·2021년 2월 19일

pandas python scikit learn sklearn 매일코딩 머신러닝

[매일코딩 스터디]

목록 보기

19/62

Today

강의

파이썬 머신러닝 완벽 가이드 (목표 진도 끝)

스터디 내용

회귀 파트 강의 마무리
데이터 전처리
교차검증
평가
회귀

결과

# 라이브러리 임포트
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the data, split it into train/test sets, then standardize the features
diabetes_data = pd.read_csv('diabetes.csv')

# Every column except the last is used as a feature; the last column is the label.
x = diabetes_data.iloc[:, :-1]
y = diabetes_data.iloc[:, -1]

lr = LogisticRegression()

# Split the raw features first; stratify=y keeps the class ratio in both splits.
x_train_raw, x_test_raw, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, stratify=y
)

# Fit the scaler on the training rows only, so that the test rows' mean and
# variance do not leak into preprocessing; the test rows are only transformed.
scaler = StandardScaler()
xtrain = scaler.fit_transform(x_train_raw)
xtest = scaler.transform(x_test_raw)

# Scaled copy of the full feature matrix, kept under its original name for any
# later code that still refers to it.
x_scaled = scaler.transform(x)

# 평가 함수, 정밀도-재현율 커브 plot 함수 강의 소스코드 사용

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a classifier.

    Args:
        y_test: True labels.
        pred: Predicted labels. Despite the ``None`` default this is
            required: it is passed straight to every label-based metric.
        pred_proba: Scores handed to ``roc_auc_score``. When ``None`` the
            AUC is left out of the report instead of raising.

    Returns:
        None. The report is written to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC needs scores rather than labels, so it is only computed when
    # scores were supplied. Computed before any printing, as before.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print('오차 행렬')
    print(confusion)
    # The template is built from adjacent literals rather than a backslash
    # continuation inside one literal, which would embed the next line's
    # indentation in the printed text.
    report = (
        '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, '
        'F1: {3:.4f}'
    ).format(accuracy, precision, recall, f1)
    if roc_auc is not None:
        report += ', AUC:{0:.4f}'.format(roc_auc)
    print(report)
    
def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        y_test: True labels, forwarded to ``precision_recall_curve``.
        pred_proba_c1: Scores forwarded to ``precision_recall_curve``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    prec, rec, thr = precision_recall_curve(y_test, pred_proba_c1)

    # Both curves are sliced to the length of the threshold array so that
    # every plotted point has a matching x value.
    n_thr = thr.shape[0]

    plt.figure(figsize=(8, 6))
    plt.plot(thr, prec[:n_thr], linestyle='--', label='precision')
    plt.plot(thr, rec[:n_thr], label='recall')

    # Re-tick the x axis in steps of 0.1 across the current axis limits.
    x_min, x_max = plt.xlim()
    tick_positions = np.round(np.arange(x_min, x_max, 0.1), 2)
    plt.xticks(tick_positions)

    # Axis labels, legend and grid.
    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()
    
# Train the model on the data and visualize the results

# Fit on the training split, then predict labels for the test split.
lr.fit(xtrain, ytrain)
pred = lr.predict(xtest)
# Keep only column 1 of predict_proba; it is the score used by the curves below.
pred_proba = lr.predict_proba(xtest)[:,1]
# NOTE(review): the return value is neither stored nor printed, so it is only
# visible where the environment echoes expression results (e.g. as the last
# line of a notebook cell).
confusion_matrix(ytest, pred)
precision_recall_curve_plot(ytest, pred_proba) 

# ROC curve: fpr on the x axis, tpr on the y axis.
fpr, tpr, thresholds = roc_curve(ytest, pred_proba)
plt.plot(fpr, tpr)
# NOTE(review): as with confusion_matrix above, this value is not stored or printed.
roc_auc_score(ytest, pred_proba)

# Replace the 0 values in the data, then re-train and evaluate

# Columns in which a literal 0 is treated as a missing measurement.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Mask the zeros as NaN first so they are excluded from the column means, then
# fill each column with the mean of its remaining values. Taking the mean with
# the zeros still in place would let the placeholders pull it down.
# NOTE(review): the means are taken over the whole dataset (train and test
# rows) because diabetes_data is updated in place before the split.
zero_as_nan = diabetes_data[zero_features].replace(0, np.nan)
diabetes_data[zero_features] = zero_as_nan.fillna(zero_as_nan.mean())

X = diabetes_data.iloc[:, :-1]
y = diabetes_data.iloc[:, -1]

# Split the raw features first; stratify=y keeps the class ratio in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)

# Fit the scaler on the training rows only so the test rows' statistics do not
# leak into preprocessing; the test rows are only transformed.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept under its original name.
X_scaled = scaler.transform(X)

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

# Column 1 of predict_proba is the score passed on for the AUC.
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)