💎 탐색한 데이터로 모델 성능 개선
🔼 성능 개선
# Experiment: train a decision tree on every column except "Pregnancies"
# and the label, then report accuracy on the held-out rows.
# Rows before position 614 are the training set; the rest are the test set.
train = df[:614].copy()
test = df[614:].copy()

# Feature list = all columns minus "Pregnancies" and the label column.
feature_names2 = train.columns.tolist()
feature_names2.remove("Pregnancies")
feature_names2.remove("Outcome")
label_name = 'Outcome'

X_train = train[feature_names2]
y_train = train[label_name]
X_test = test[feature_names2]
y_test = test[label_name]

from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree (and therefore the accuracy)
# reproducible, so the score can be compared fairly with other experiments.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# Count mismatching predictions directly instead of subtracting labels,
# which only works when the labels happen to be numeric 0/1.
diff_count = (y_test != y_predict).sum()

# Accuracy on the test rows, in percent (displayed as the cell output).
(len(y_test) - diff_count) / len(y_test) * 100
# Experiment: add boolean age-bucket columns and check whether they change
# the decision tree's accuracy.
# Buckets: under 25, 25 through 60 (both inclusive), and over 60.
df["Age_low"] = df["Age"] < 25
df["Age_middle"] = (df["Age"] >= 25) & (df["Age"] <= 60)
df["Age_high"] = df["Age"] > 60

# Preview the new columns next to the raw age (displayed as the cell output).
df[["Age", "Age_low", "Age_middle", "Age_high"]].head()

# Rows before position 614 are the training set; the rest are the test set.
train = df[:614].copy()
test = df[614:].copy()

# Feature list = all columns (now including the age buckets) minus
# "Pregnancies" and the label column.
feature_names3 = train.columns.tolist()
feature_names3.remove("Pregnancies")
feature_names3.remove("Outcome")
label_name = 'Outcome'

X_train = train[feature_names3]
y_train = train[label_name]
X_test = test[feature_names3]
y_test = test[label_name]

# A fixed random_state makes the fitted tree (and therefore the accuracy)
# reproducible, so the score can be compared fairly with other experiments.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# Count mismatching predictions directly instead of subtracting labels,
# which only works when the labels happen to be numeric 0/1.
diff_count = (y_test != y_predict).sum()

# Accuracy on the test rows, in percent (displayed as the cell output).
(len(y_test) - diff_count) / len(y_test) * 100
# Experiment: treat Insulin == 0 as missing, fill the gaps with a fixed value
# per Outcome class, and train on the filled column instead of raw "Insulin".

# Remove the age-bucket columns added by the previous experiment.
df = df.drop(["Age_low", "Age_middle", "Age_high"], axis=1)

# Copy of "Insulin" in which zeros are replaced by NaN.
df["Insulin_nan"] = df["Insulin"].replace(0, np.nan)
df[["Insulin", "Insulin_nan"]].head()

# Select the columns with a list: indexing a groupby with a bare tuple of
# column names is rejected by pandas >= 2.0.
df.groupby(["Outcome"])[["Insulin", "Insulin_nan"]].agg(["mean", "median"])

# Fill missing insulin values with one constant per Outcome class.
# NOTE(review): 102.5 / 169.5 are presumably the per-Outcome medians of
# "Insulin_nan" shown by the aggregation above -- confirm against its output.
# NOTE(review): the fill value depends on the label and is applied to the
# test rows too, so the measured accuracy may be optimistic.
df.loc[(df["Outcome"] == 0) & (df["Insulin_nan"].isnull()), "Insulin_nan"] = 102.5
df.loc[(df["Outcome"] == 1) & (df["Insulin_nan"].isnull()), "Insulin_nan"] = 169.5

# Rows before position 614 are the training set; the rest are the test set.
train = df[:614].copy()
test = df[614:].copy()

# Feature list = all columns minus "Pregnancies", raw "Insulin" (replaced by
# "Insulin_nan") and the label column.
feature_names4 = train.columns.tolist()
feature_names4.remove("Pregnancies")
feature_names4.remove("Insulin")
feature_names4.remove("Outcome")
label_name = 'Outcome'

X_train = train[feature_names4]
y_train = train[label_name]
X_test = test[feature_names4]
y_test = test[label_name]

# A fixed random_state makes the fitted tree (and therefore the accuracy)
# reproducible, so the score can be compared fairly with other experiments.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# Count mismatching predictions directly instead of subtracting labels,
# which only works when the labels happen to be numeric 0/1.
diff_count = (y_test != y_predict).sum()

# Accuracy on the test rows, in percent (displayed as the cell output).
(len(y_test) - diff_count) / len(y_test) * 100
# Experiment: drop training rows whose "Insulin_nan" exceeds the upper
# outlier fence (Q3 + 1.5 * IQR) and retrain.
IQR3 = df["Insulin_nan"].quantile(0.75)  # third quartile
IQR1 = df["Insulin_nan"].quantile(0.25)  # first quartile
IQR = IQR3 - IQR1                        # interquartile range
OUT = IQR3 + (IQR * 1.5)                 # upper outlier fence

# Rows before position 614 are the training set; the rest are the test set.
train = df[:614].copy()
test = df[614:].copy()

# Filter only the training rows; the test rows are left untouched.
# The computed fence is used here so the threshold follows the data instead
# of a separately maintained literal.
train = train[train["Insulin_nan"] < OUT]

# Feature list = all columns minus "Pregnancies", raw "Insulin" and the label.
feature_names5 = train.columns.tolist()
feature_names5.remove("Pregnancies")
feature_names5.remove("Insulin")
feature_names5.remove("Outcome")
label_name = 'Outcome'

X_train = train[feature_names5]
y_train = train[label_name]
X_test = test[feature_names5]
y_test = test[label_name]

# A fixed random_state makes the fitted tree (and therefore the accuracy)
# reproducible, so the score can be compared fairly with other experiments.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# Count mismatching predictions directly instead of subtracting labels,
# which only works when the labels happen to be numeric 0/1.
diff_count = (y_test != y_predict).sum()

# Accuracy on the test rows, in percent (displayed as the cell output).
(len(y_test) - diff_count) / len(y_test) * 100
# Experiment: standardize "Glucose" and "DiabetesPedigreeFunction" and retrain.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the mean/variance from the training rows only (positions before 614)
# so that no statistics from the test rows leak into the preprocessing,
# then apply that same transform to every row.
scaler.fit(df[:614][["Glucose", "DiabetesPedigreeFunction"]])
scale = scaler.transform(df[["Glucose", "DiabetesPedigreeFunction"]])

# Overwrite the two columns in place with their standardized values.
df[["Glucose", "DiabetesPedigreeFunction"]] = scale

# Rows before position 614 are the training set; the rest are the test set.
train = df[:614].copy()
test = df[614:].copy()

# Feature list = all columns minus "Pregnancies", raw "Insulin" and the label.
feature_names6 = train.columns.tolist()
feature_names6.remove("Pregnancies")
feature_names6.remove("Insulin")
feature_names6.remove("Outcome")
label_name = 'Outcome'

X_train = train[feature_names6]
y_train = train[label_name]
X_test = test[feature_names6]
y_test = test[label_name]

# A fixed random_state makes the fitted tree (and therefore the accuracy)
# reproducible, so the score can be compared fairly with other experiments.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# Count mismatching predictions directly instead of subtracting labels,
# which only works when the labels happen to be numeric 0/1.
diff_count = (y_test != y_predict).sum()

# Accuracy on the test rows, in percent (displayed as the cell output).
(len(y_test) - diff_count) / len(y_test) * 100