


유클리디안 거리 (Euclidean distance)
맨하탄 거리 (Manhattan distance)
# KNN classification on the breast-cancer dataset:
# sweep K from 1 to 10 and compare train/test accuracy.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

# Standardize features: KNN is distance-based, so scaling matters.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Track how accuracy changes as K grows.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

train_acc_list = []
test_acc_list = []
for k in range(1, 11):
    # Default metric is Euclidean distance.
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_scaled, y_train)
    train_acc_list.append(accuracy_score(y_train, model.predict(X_train_scaled)))
    test_acc_list.append(accuracy_score(y_test, model.predict(X_test_scaled)))

import pandas as pd

df = pd.DataFrame({
    "K": range(1, 11),
    "Train": train_acc_list,
    "Test": test_acc_list,
})
df.set_index("K", inplace=True)
# Smaller K means a more complex model; when overfitting, increase K.
df
df.plot();
## Results when trained on X WITHOUT feature scaling
import pandas as pd

df = pd.DataFrame({
    "K": range(1, 11),
    "Train": train_acc_list,
    "Test": test_acc_list,
})
df.set_index("K", inplace=True)
df
# KNN regression on the Boston housing data, tuning K and the
# Minkowski power parameter p with a cross-validated grid search.
import pandas as pd

# NOTE(review): the filename "boston_hosing.csv" looks like a typo for
# "boston_housing.csv" — confirm the actual file name on disk.
df = pd.read_csv("data/boston_hosing.csv")
df.head()
# Target is the MEDV column; everything else is a feature.
X = df.drop(columns='MEDV').values
y = df['MEDV'].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Search the best K and p with GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Scale inside the pipeline so each CV fold fits the scaler
# on its own training split only (no leakage).
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor()),
])
params = {
    "knn__n_neighbors": range(3, 10),
    "knn__p": [1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
}
gs = GridSearchCV(
    pipeline,
    params,
    scoring="neg_mean_squared_error",
    cv=4,
    n_jobs=-1,
)
gs.fit(X_train, y_train)
# Best hyper-parameters found by the grid search.
# Observed output (underscores were mangled by extraction):
#   {'knn__n_neighbors': 3, 'knn__p': 1}
gs.best_params_
# Scoring was neg_mean_squared_error, so negate to get the MSE.
# Observed output: 18.73019870598482
-gs.best_score_
# Top 5 parameter combinations ranked by cross-validation score.
df = pd.DataFrame(gs.cv_results_)
df.sort_values('rank_test_score').head(5)
# Final evaluation of the best estimator on the held-out test set.
from metrics import print_regression_metrics  # project-local helper

best_model = gs.best_estimator_
pred = best_model.predict(X_test)
print_regression_metrics(y_test, pred, "최종평가")
MSE: 29.62185476815397
RMSE: 5.442596326033557
R Squared: 0.6374270288407293