KNN

PKH·2023년 6월 30일
0

into_fintech

목록 보기
11/11
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the basketball statistics CSV straight from the book's GitHub repo.
df = pd.read_csv(
    "https://raw.githubusercontent.com/wikibook/machine-learning/2.0/data/csv/basketball_stat.csv"
)
print(df.head())

# Show how many rows carry each value of the Pos column.
print(df['Pos'].value_counts())

# Scatter-plot several column pairs, coloured by the Pos column, one figure
# per pair. Each entry is (x column, y column, marker size).
feature_pairs = [
    ('STL', '2P', 100),
    ('AST', '2P', 100),
    ('BLK', '3P', 150),
    ('TRB', '3P', 150),
]

for x_col, y_col, marker_size in feature_pairs:
    sns.lmplot(x=x_col, y=y_col,
               data=df,
               fit_reg=False,  # scatter only, no regression line
               scatter_kws={'s': marker_size},  # size of each plotted point
               # one marker per hue level (assumes Pos has exactly two
               # distinct values -- TODO confirm against the value counts)
               markers=['o', 'x'],
               hue='Pos')  # colour/marker is chosen by the Pos label
    # Derive the title from the plotted columns so it always matches the
    # axes (the TRB plot used to be mislabelled as "BLK and 3P").
    plt.title(f'{x_col} and {y_col} in 2d plane')

# Remove the 2P, AST and STL columns from df in place.
df.drop(columns=['2P', 'AST', 'STL'], inplace=True)

print(df.head())

from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 20% of the rows for evaluation; the other 80% is used for training.
# NOTE(review): no random_state is passed, so the split changes between runs.
train, test = train_test_split(df, test_size=0.2)
print(len(train), len(test))

from sklearn.neighbors import KNeighborsClassifier

# Candidate k values: every odd number from 3 up to, but not including,
# half the number of training rows.
max_k_range = train.shape[0] // 2
k_list = list(range(3, max_k_range, 2))

print(k_list)

# For every candidate k, record the mean accuracy of a 10-fold
# cross-validation run on the training rows.
cross_validation_scores = []
x_train = train[['3P', 'BLK', 'TRB']]
y_train = train['Pos']

for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        estimator=knn,
        X=x_train,
        y=y_train.values.ravel(),  # flatten the labels to a 1-D array
        cv=10,  # number of cross-validation folds
        scoring='accuracy',
    )
    cross_validation_scores.append(scores.mean())

cross_validation_scores

# Plot the mean cross-validation accuracy against each candidate k.
plt.plot(k_list, cross_validation_scores)
plt.xlabel('the number of k')
plt.ylabel('Accuracy')
# Call show(); a bare `plt.show` only references the function and renders
# nothing when the script runs outside an inline notebook backend.
plt.show()

# Choose the k whose mean cross-validation accuracy is highest; when several
# share the top score, the first (smallest) one in k_list is taken.
best_score = max(cross_validation_scores)
k = k_list[cross_validation_scores.index(best_score)]
print(f'The best number of k: {k}')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Select the feature columns and the Pos label from both partitions.
feature_columns = ['3P', 'BLK', 'TRB']
x_train = train[feature_columns]
y_train = train[['Pos']]
x_test = test[feature_columns]
y_test = test[['Pos']]

# Fit a classifier with the current value of k on the training partition.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x_train, y_train.values.ravel())

# Predict the held-out rows and flatten the true labels to 1-D for comparison.
pred = knn.predict(x_test)
y_true = y_test.values.ravel()

print(f'accuracy: {accuracy_score(y_true, pred)!s}')

# Side-by-side table of predicted versus actual labels.
comparison = pd.DataFrame({'prediction': pred, 'ground-truth': y_true})
comparison

구글 colab

0개의 댓글