Today was the roughest day I've had in a while, condition-wise. Still, I focused on at least holding on to the big picture of the material, so I need to review steadily over the holiday break that starts this weekend.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# hds is the course's plotting/statistics helper library; its import,
# like the loading of df (the white-wine quality data), is not shown in this log.

df.head()

df.info()
df.isna().sum()
df.describe().round(3)
# quality is an ordinal variable; check its class distribution
quality = df['quality'].value_counts().sort_index()
# quality
# 3 20
# 4 163
# 5 1457
# 6 2198
# 7 880
# 8 175
# 9 5
# Name: count, dtype: int64
# Binarize the target: quality >= 7 becomes grade 1, the rest grade 0
df['grade'] = np.where(df['quality'].ge(7), 1, 0)
df = df.drop(columns=['free sulfur dioxide', 'quality'])
hds.plot.bar_freq(data=df, x='grade')
hds.plot.corr_heatmap(data=df, fontsize=7)
for var_name in df.columns.to_list()[:-1]:
    hds.plot.box_group(data=df, x='grade', y=var_name, palette=['skyblue', 'orange'])
    plt.show()
yvar = 'grade'
X = df.drop(columns=yvar)
y = df[yvar].copy()
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)
X_train.shape
# (3428, 10)
X_valid.shape
# (1470, 10)
y_train.value_counts(normalize=True).sort_index()
# grade
# 0 0.782089
# 1 0.217911
# Name: proportion, dtype: float64
y_valid.value_counts(normalize=True).sort_index()
# grade
# 0 0.787075
# 1 0.212925
# Name: proportion, dtype: float64
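The two splits land on similar but not identical class ratios (21.8% vs 21.3% positives). If matching the ratios exactly mattered, train_test_split accepts a stratify argument; a minimal sketch (the log proceeds with the unstratified split above):

# Stratified variant: both splits keep the exact 0/1 proportions of y
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)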
objs = {
'X_train': X_train,
'X_valid': X_valid,
'y_train': y_train,
'y_valid': y_valid
}
pd.to_pickle(obj=objs, filepath_or_buffer='WhiteWine.pkl')
globals().update(objs)
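Because the four objects were pickled together, a later session can restore them the same way; a quick sketch using pd.read_pickle:

# Restore the saved split in a fresh session
objs = pd.read_pickle(filepath_or_buffer='WhiteWine.pkl')
globals().update(objs)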
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
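Standardizing matters for KNN because Euclidean distance is dominated by whichever feature has the largest raw range. A quick way to see how unequal the raw ranges are (X is still the unscaled feature frame):

# Raw feature ranges differ by orders of magnitude; without scaling,
# the large-range features would dominate every distance computation.
print((X.max() - X.min()).round(3).sort_values(ascending=False))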
from sklearn.neighbors import KNeighborsClassifier
model_unif = KNeighborsClassifier()  # defaults: n_neighbors=5, weights='uniform'
model_unif.fit(X=X_train, y=y_train)
model_unif.score(X=X_train, y=y_train)
# 0.8865227537922987
model_unif.score(X=X_valid, y=y_valid)
# 0.8061224489795918
distances, indices = model_unif.kneighbors(X=X_valid)
distances[0]
# array([0.84149839, 1.18088628, 1.25421004, 1.25455938, 1.25766403])
indices[0]
# array([1833, 1041, 1351, 3203, 2240])
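indices[0] holds row positions in X_train, so the uniform-weight prediction for the first validation sample is just the majority label among those five training rows; a quick check of that mechanism:

# Labels of the 5 nearest training neighbors of the first validation row;
# the mode of these labels is what model_unif.predict returns for it.
neighbor_labels = y_train.iloc[indices[0]]
print(neighbor_labels.values)
print(neighbor_labels.mode()[0])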
kth_distances = pd.Series(data=distances[:, -1])
kth_distances.describe().round(3)
# count 1470.000
# mean 1.369
# std 0.650
# min 0.000
# 25% 1.073
# 50% 1.269
# 75% 1.546
# max 17.231
# dtype: float64
kth_distances.sort_values().tail()
# 5 4.638065
# 625 4.843886
# 903 5.734853
# 1025 5.997965
# 190 17.231431
# dtype: float64
sns.histplot(x=kth_distances, binrange=(0, 6), binwidth=0.5)
plt.show()
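One validation point's 5th-neighbor distance (17.23) is more than ten times the 75th percentile (1.55): it sits in a nearly empty region of the scaled feature space, where any KNN prediction is unreliable. It can be pulled out directly:

# Inspect the validation row farthest from its 5th nearest neighbor (index 190 above)
outlier_pos = kth_distances.idxmax()
print(outlier_pos, kth_distances[outlier_pos])
print(X_valid[outlier_pos])  # its standardized feature values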

model_dist = KNeighborsClassifier(weights='distance')  # neighbors vote with weight 1/distance
model_dist.fit(X_train, y_train)
model_dist.score(X=X_train, y=y_train)
# 1.0
model_dist.score(X=X_valid, y=y_valid)
# 0.8469387755102041
y_pred_unif = model_unif.predict(X_valid)
y_pred_dist = model_dist.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_unif)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_dist)
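The perfect train accuracy of model_dist is expected: each training point is its own nearest neighbor at distance 0, which gives it infinite weight. The distance-weighted vote can also be reproduced by hand for one validation row; a small sketch:

# Reproduce the 1/d weighted vote for the first validation sample
d, idx = model_dist.kneighbors(X=X_valid[:1])
w = 1.0 / d[0]                           # neighbor weights
labels = y_train.iloc[idx[0]].values
prob_1 = w[labels == 1].sum() / w.sum()  # weighted share of class 1
print(prob_1, model_dist.predict_proba(X_valid[:1]))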


def valid_score(k):
    model = KNeighborsClassifier(n_neighbors=k, weights='distance')
    model.fit(X=X_train, y=y_train)
    score = model.score(X=X_valid, y=y_valid)
    return score
ks = range(1, 100, 2)
vl_acc = [valid_score(k) for k in ks]
sns.lineplot(x=ks, y=vl_acc)
plt.show()

index = np.argmax(vl_acc)
best_k = ks[index]
model_best = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
model_best.fit(X=X_train, y=y_train)
model_best.score(X=X_train, y=y_train)
# 1.0
model_best.score(X=X_valid, y=y_valid)
# 0.8727891156462585
from sklearn.metrics import f1_score
def valid_f1_score(k):
    model = KNeighborsClassifier(n_neighbors=k, weights='distance')
    model.fit(X=X_train, y=y_train)
    y_pred = model.predict(X=X_valid)
    score = f1_score(y_true=y_valid, y_pred=y_pred)
    return score
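valid_f1_score is defined but the sweep itself isn't shown in the log; presumably it mirrors the accuracy sweep above. A minimal sketch:

# With only ~22% positives, F1 is a more honest selection criterion than accuracy
vl_f1 = [valid_f1_score(k) for k in ks]
best_k_f1 = ks[np.argmax(vl_f1)]
sns.lineplot(x=ks, y=vl_f1)
plt.show()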
from imblearn.over_sampling import SMOTE
# SMOTE synthesizes new minority-class samples by interpolating between
# each minority point and its k nearest minority-class neighbors
smote = SMOTE(k_neighbors=5, random_state=0)
X_bal, y_bal = smote.fit_resample(X=X_train, y=y_train)
y_bal.value_counts(normalize=True)
# grade
# 0 0.5
# 1 0.5
# Name: proportion, dtype: float64
model_bal = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
model_bal.fit(X=X_bal, y=y_bal)
model_bal.score(X=X_bal, y=y_bal)
# 1.0
model_bal.score(X=X_valid, y=y_valid)
# 0.7952380952380952
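Validation accuracy drops from 0.873 to 0.795 after SMOTE, but accuracy alone hides the trade-off: oversampling typically raises minority-class recall at the cost of precision. The clfmetrics helper used earlier would make that visible:

# Per-class metrics of the SMOTE-balanced model on the validation set
y_pred_bal = model_bal.predict(X_valid)
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_bal)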
# From here the log switches datasets: the Insulin/SkinThickness/BMI/Outcome
# columns indicate the (Pima) diabetes data, used next for KNN regression.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer = IterativeImputer()
df_imp = pd.DataFrame(data=imputer.fit_transform(X=df), columns=df.columns)
df_imp['Insulin'] = df_imp['Insulin'].clip(lower=16)  # floor implausibly low imputed values
cond1 = df_imp['SkinThickness'].lt(80)  # drop extreme outliers
cond2 = df_imp['BMI'].lt(60)
df_imp = df_imp.loc[cond1 & cond2, :].drop(columns='Outcome')
df_imp = df_imp.reset_index(drop=True)
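The split that produces the new X_train/X_valid/y_train/y_valid for the regression isn't shown in the log. A sketch of the presumed step, with target_col as a hypothetical stand-in for whichever column the class actually predicted:

# Hypothetical: the actual regression target is not shown in the log
target_col = 'Insulin'
X = df_imp.drop(columns=target_col)
y = df_imp[target_col].copy()
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)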
from sklearn.neighbors import KNeighborsRegressor
model_unif = KNeighborsRegressor(p=1)  # p=1: Manhattan (L1) distance
model_unif.fit(X=X_train, y=y_train)
model_unif.score(X=X_train, y=y_train)
# 0.5783751948120422
model_unif.score(X=X_valid, y=y_valid)
# 0.34874094754106255
distances, indices = model_unif.kneighbors(X=X_valid)
distances[0]
# array([1.57368314, 2.08625259, 2.10585654, 2.23487783, 2.30612248])
kth_distance = pd.Series(data=distances[:, -1])
sns.histplot(x=kth_distance, binrange=(1, 7), binwidth=0.5)
plt.show()

model_dist = KNeighborsRegressor(weights='distance', p=1)
model_dist.fit(X=X_train, y=y_train)
model_dist.score(X=X_train, y=y_train)
# 1.0
model_dist.score(X=X_valid, y=y_valid)
# 0.35088449132245114
y_pred_unif = model_unif.predict(X=X_valid)
y_pred_dist = model_dist.predict(X=X_valid)
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_unif)
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_dist)


def valid_score(k):
    model = KNeighborsRegressor(n_neighbors=k, weights='distance', p=1)
    model.fit(X_train, y_train)
    score = model.score(X=X_valid, y=y_valid)
    return score
ks = range(1, 100)
vl_rsq = [valid_score(k) for k in ks]
index = np.argmax(vl_rsq)
best_k = ks[index]
sns.lineplot(x=ks, y=vl_rsq)
plt.show()

model_best = KNeighborsRegressor(n_neighbors=best_k, p=1, weights='distance')
model_best.fit(X=X_train, y=y_train)
model_best.score(X=X_train, y=y_train)
# 1.0
model_best.score(X=X_valid, y=y_valid)
# 0.36093215506286147
Tomorrow we start with decision trees and move on to random forests and ensembles, the crown jewels of machine learning. The more I learn about machine learning, the more I feel I need to review data preprocessing and the underlying theory that much more thoroughly.