데이터셋 출처
데이터 구성
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the diabetes dataset (label column: "Outcome", 0/1).
df = pd.read_csv("http://bit.ly/data-diabetes-csv")

# "Insulin" uses 0 as a missing-value marker. Impute those zeros with the
# per-class median computed from the non-zero measurements only, so the
# imputed value reflects the typical insulin level of each outcome group.
insulin_median = df[df["Insulin"] > 0].groupby("Outcome")["Insulin"].median()
df["Insulin_fill"] = df["Insulin"]
for outcome, median in insulin_median.items():
    df.loc[(df["Outcome"] == outcome) & (df["Insulin_fill"] == 0), "Insulin_fill"] = median

label_name = "Outcome"
# Features: every column except the raw (zero-polluted) "Insulin" and the
# label itself; the imputed "Insulin_fill" column stays in the feature set.
feature_names = df.columns.tolist()
feature_names.remove("Insulin")
feature_names.remove(label_name)

X = df[feature_names]
y = df[label_name]

from sklearn.model_selection import train_test_split
# Stratified 80/20 split keeps the class balance identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
Scikit-learn DecisionTreeClassifier
DecisionTreeClassifier(
*,
criterion='gini',
splitter='best',
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=None,
random_state=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
class_weight=None,
ccp_alpha=0.0,
)
GridSearchCV(estimator,
param_grid,
*,
scoring=None,
n_jobs=None,
refit=True,
cv=None,
verbose=0,
pre_dispatch='2*n_jobs',
error_score=nan,
return_train_score=False)
# Candidate tree depths: 3, 5, ..., 19 (9 odd values).
max_depth = list(range(3, 20, 2))
# Candidate fractions of features considered when looking for the best split.
max_features = [0.3, 0.5, 0.7, 0.8, 0.9]
# Hyper-parameter grid, keyed by DecisionTreeClassifier parameter name.
parameters = {"max_depth":max_depth, "max_features":max_features}
from sklearn.model_selection import GridSearchCV
# Exhaustive search over all 9 * 5 = 45 combinations, 5-fold CV each,
# running folds in parallel on all available cores (n_jobs=-1).
clf = GridSearchCV(model, parameters, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)
best_score_ : 최고 파라미터 조합의 평균 교차검증 점수 (mean cross-validated score)
# Mean cross-validated score of the best parameter combination.
clf.best_score_
best_estimator_ : 최고 성능 모델 (refit=True일 때 전체 학습 데이터로 재학습됨)
# Estimator refitted on the whole training set with the best parameters
# (GridSearchCV default refit=True).
clf.best_estimator_
# Full cross-validation results, best-ranked parameter combinations first.
pd.DataFrame(clf.cv_results_).sort_values("rank_test_score")
{
'param_max_depth', 'param_max_features',
'split0_test_score', 'split1_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score',
'split0_train_score', 'split1_train_score', 'mean_train_score', 'std_train_score',
'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'params'
}
Scikit-learn RandomizedSearchCV
RandomizedSearchCV(estimator,
param_distributions,
*,
n_iter=10,
scoring=None,
n_jobs=None,
refit=True,
cv=None,
verbose=0,
pre_dispatch='2*n_jobs',
random_state=None,
error_score=nan,
return_train_score=False)
from sklearn.model_selection import RandomizedSearchCV

# Sample candidate values from a seeded generator so the candidate lists
# themselves are reproducible. RandomizedSearchCV's random_state=42 only
# fixes which candidates are drawn from these lists, not how the lists
# were generated — the original unseeded np.random calls made every run
# search a different space.
rng = np.random.default_rng(42)
param_distributions = {
    "max_depth": rng.integers(3, 20, 10),      # 10 integer depths in [3, 20)
    "max_features": rng.uniform(0.5, 1, 10),   # 10 fractions in [0.5, 1)
}
clfr = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    scoring="accuracy",
    n_jobs=-1,     # parallelize CV folds across all cores
    n_iter=10,     # number of sampled parameter settings
    cv=5,
    random_state=42,
    verbose=1,
)
clfr.fit(X_train, y_train)
clfr.best_estimator_   # refit on the full training set with the best params
clfr.best_params_      # winning parameter dictionary
clfr.best_score_       # mean CV accuracy of the best candidate
# Five best-ranked candidates from the CV results table.
pd.DataFrame(clfr.cv_results_).nsmallest(5, "rank_test_score")
GridSearchCV
# Winning model from the exhaustive grid search.
best_model = clf.best_estimator_
RandomizedSearchCV
# Winning model from the randomized search (overwrites the grid-search pick).
best_model = clfr.best_estimator_
데이터를 머신러닝 모델로 학습시킨다.
# NOTE(review): with refit=True (the default) best_estimator_ is already
# fitted on X_train, so this fit is redundant but harmless.
best_model.fit(X_train, y_train)
y_predict = best_model.predict(X_test)
# Impurity-based importance of each feature in the fitted tree.
best_model.feature_importances_
# Horizontal bar chart: importance per feature name.
sns.barplot(x=best_model.feature_importances_, y=best_model.feature_names_in_)
from sklearn.metrics import accuracy_score
# Fraction of exactly correct predictions on the held-out test set.
accuracy_score(y_test, y_predict)
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 plus macro and weighted averages.
print(classification_report(y_test, y_predict))