https://www.basketball-reference.com/leagues/NBA_2023_totals.html
3P, TRB, BLK, PTS를 이용하여 Pos를 예측
ROC를 통해 모델 성능 비교
코드 실습은 Jupyter에서 진행
# Drop the 'Awards' column.
df.drop('Awards', axis=1, inplace=True)

# Fill missing values in each stat column with that column's mean.
for stat_col in ['3P', '3P%', 'TRB', 'BLK', '2P%', 'FT%']:
    df.loc[:, stat_col] = df[stat_col].fillna(df[stat_col].mean())

# Encode the target. fit_transform returns a 2-D array with one indicator
# column per distinct 'Pos' value, which cannot be stored as-is in the single
# 'Pos' column. The ROC code that consumes this label (roc_curve and
# predict_proba[:, 1]) needs one binary 0/1 value per row, so keep exactly one
# indicator column: 1.0 when the row's position equals
# one_hot_encoder.categories_[0][0], otherwise 0.0.
# NOTE(review): which position should be the positive class is an assumption;
# change the column index below to target a different position.
one_hot_encoder = OneHotEncoder(sparse_output=False)
pos_one_hot = one_hot_encoder.fit_transform(df[['Pos']])
df['Pos'] = pos_one_hot[:, 0]

# Alias kept because the split step reads the data through `test_df`.
test_df = df
# Split the data into train (70%) and test (30%) subsets.
# The feature columns are listed once so both subsets always use the same set.
feature_cols = ['3P', 'BLK', 'TRB', '2P%', 'FT%']

# A fixed seed keeps the partition identical across runs, so differences
# between models are not caused by a different random split each time.
train, test = train_test_split(test_df, test_size=0.3, random_state=42)

train_data = train[feature_cols]
train_label = train['Pos']  # target column
test_data = test[feature_cols]
test_label = test['Pos']
# Build the four classifiers that will be compared.
svc = SVC(probability=True)  # probability=True so predict_proba can be called later
lr = LogisticRegression(max_iter=1000)
dt = DecisionTreeClassifier()
rm_forest = RandomForestClassifier()

# Train every model on the same training features and labels.
for clf in (svc, lr, dt, rm_forest):
    clf.fit(train_data, train_label)
# For each model: predict class probabilities on the test set, build the ROC
# curve from column 1 of predict_proba, and plot it with its AUC (rounded to
# two decimals) in the legend label.
# NOTE(review): column 1 is treated as the positive-class score, which assumes
# a binary label - confirm against how 'Pos' is encoded.

# Decision tree
y_pred_dt = dt.predict_proba(test_data)
fpr_dt, tpr_dt, z = roc_curve(test_label, y_pred_dt[:, 1])
auc_dt = round(roc_auc_score(test_label, y_pred_dt[:, 1]), 2)
plt.plot(fpr_dt, tpr_dt, color='r', label=f'DT.AUC : {auc_dt}')

# Support vector classifier
y_pred_svc = svc.predict_proba(test_data)
fpr_svc, tpr_svc, z = roc_curve(test_label, y_pred_svc[:, 1])
auc_svc = round(roc_auc_score(test_label, y_pred_svc[:, 1]), 2)
plt.plot(fpr_svc, tpr_svc, color='g', label=f'SVC.AUC : {auc_svc}')

# Logistic regression
y_pred_lr = lr.predict_proba(test_data)
fpr_lr, tpr_lr, z = roc_curve(test_label, y_pred_lr[:, 1])
auc_lr = round(roc_auc_score(test_label, y_pred_lr[:, 1]), 2)
plt.plot(fpr_lr, tpr_lr, color='b', label=f'LR.AUC : {auc_lr}')

# Random forest
y_pred_rf = rm_forest.predict_proba(test_data)
fpr_rf, tpr_rf, z = roc_curve(test_label, y_pred_rf[:, 1])
auc_rf = round(roc_auc_score(test_label, y_pred_rf[:, 1]), 2)
plt.plot(fpr_rf, tpr_rf, color='y', label=f'RF.AUC : {auc_rf}')

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')

plt.title('dt, svc, lr, rf 모델 Roc커브 그래프')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()