import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the basketball statistics CSV from the wikibook/machine-learning
# GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/wikibook/machine-learning/2.0/data/csv/basketball_stat.csv"
df = pd.read_csv(DATA_URL)

# Preview the first rows, then count the rows for each value of the Pos column.
print(df.head())
print(df["Pos"].value_counts())
# Scatter plot of the STL column (x) against 2P (y), grouped by the Pos column.
sns.lmplot(
    data=df,
    x="STL",
    y="2P",
    hue="Pos",  # Pos is the label the classifier predicts later in this file
    markers=["o", "x"],
    scatter_kws={"s": 100},  # size of the plotted points
    fit_reg=False,  # points only, no regression line
)
plt.title("STL and 2P in 2d plane")
# Scatter plot of the AST column (x) against 2P (y), grouped by the Pos column.
sns.lmplot(
    data=df,
    x="AST",
    y="2P",
    hue="Pos",
    markers=["o", "x"],
    scatter_kws={"s": 100},  # size of the plotted points
    fit_reg=False,  # points only, no regression line
)
plt.title("AST and 2P in 2d plane")
# Scatter plot of the BLK column (x) against 3P (y), grouped by the Pos column.
sns.lmplot(
    data=df,
    x="BLK",
    y="3P",
    hue="Pos",
    markers=["o", "x"],
    scatter_kws={"s": 150},  # size of the plotted points
    fit_reg=False,  # points only, no regression line
)
plt.title("BLK and 3P in 2d plane")
# Scatter plot of the TRB column (x) against 3P (y), grouped by the Pos column.
sns.lmplot(
    x='TRB',
    y='3P',
    data=df,
    fit_reg=False,  # points only, no regression line
    scatter_kws={'s': 150},  # size of the plotted points
    markers=['o', 'x'],
    hue='Pos',
)
# The title names the plotted x column (TRB), not BLK.
plt.title('TRB and 3P in 2d plane')
# Remove the 2P, AST and STL columns in place; the model further down is
# trained on the 3P, BLK and TRB columns only.
df.drop(columns=["2P", "AST", "STL"], inplace=True)
print(df.head())
from sklearn.model_selection import train_test_split, cross_val_score
# Split the rows: 20% are held out for evaluation, the remaining 80% are
# used for training.
train, test = train_test_split(df, test_size=0.2)

# Report the number of rows in each split.
print(len(train), len(test))
from sklearn.neighbors import KNeighborsClassifier
# Candidate k values: the odd numbers from 3 up to (but excluding) half the
# number of training rows.
max_k_range = len(train) // 2
k_list = list(range(3, max_k_range, 2))
print(k_list)
# Features and label used while searching for the best k.
x_train = train[['3P', 'BLK', 'TRB']]
y_train = train['Pos']

# Mean 10-fold cross-validated accuracy for every candidate k, stored in the
# same order as k_list.
cross_validation_scores = []
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        knn,
        x_train,
        y_train.values.ravel(),  # labels as a flat 1-D array
        scoring='accuracy',
        cv=10,  # number of cross-validation folds
    )
    cross_validation_scores.append(scores.mean())

# Bare expression: displays the scores when it is the last statement of a
# notebook cell.
cross_validation_scores
# Plot the mean cross-validation accuracy against each candidate k.
plt.plot(k_list, cross_validation_scores)
plt.xlabel('the number of k')
plt.ylabel('Accuracy')
# Call show() so the figure is actually rendered; without the parentheses the
# statement only references the function and does nothing.
plt.show()
# Pick the k with the highest mean accuracy; on ties the first (smallest)
# such k in k_list is chosen.
best_score = max(cross_validation_scores)
best_index = cross_validation_scores.index(best_score)
k = k_list[best_index]
print(f'The best number of k: {k}')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Fit the final classifier with the selected k on the training split.
feature_columns = ['3P', 'BLK', 'TRB']
knn = KNeighborsClassifier(n_neighbors=k)
x_train = train[feature_columns]
y_train = train[['Pos']]
knn.fit(x_train, y_train.values.ravel())  # flatten the one-column frame to 1-D

# Predict Pos for the held-out rows and report the accuracy.
x_test = test[feature_columns]
y_test = test[['Pos']]
pred = knn.predict(x_test)
ground_truth = y_test.values.ravel()
print(f'accuracy: {accuracy_score(ground_truth, pred)}')

# Table of predictions next to the true labels; the bare expression displays
# it when it is the last statement of a notebook cell.
comparison = pd.DataFrame({'prediction': pred, 'ground-truth': ground_truth})
comparison
# Google Colab