## Load the breast-cancer data and prepare a train/test split for the random forest
breast = pd.read_csv("./data/breast-cancer.csv")
# Encode the label as an integer: 1 where diagnosis == "M", 0 otherwise.
breast["diagnosis"] = breast["diagnosis"].eq("M").astype(int)
features = ["area_mean", "texture_mean"]
X, y = breast[features], breast["diagnosis"]
# Hold out 30% of the rows for testing; stratify on y so both splits keep the
# same class ratio, and seed the split so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)
## Fit a RandomForestClassifier and report its accuracy on the test split
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so that the randomness used while building the forest
# is reproducible; without it the fitted model and every metric derived from it
# can change from run to run.
clf = RandomForestClassifier(n_estimators=100, min_samples_split=5, random_state=1)
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
# The printed label is Korean for "accuracy"; score() is evaluated on the
# held-out test data.
print("정확도 : ", clf.score(x_test, y_test))
정확도 : 0.8947368421052632
# Evaluate the fitted classifier on the held-out test set.
pred = clf.predict(x_test)
test_cm = confusion_matrix(y_test, pred)
# Compute each scalar metric from the same (y_test, pred) pair.
test_acc, test_prc, test_rcll, test_f1 = (
    metric(y_test, pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(test_cm)
print('\n')
# Report accuracy, precision and recall (Korean labels) as percentages rounded
# to two decimals. test_f1 is computed above but not printed in this block.
for label, value in (('정확도', test_acc), ('정밀도', test_prc), ('재현율', test_rcll)):
    print('{}\t{}%'.format(label, round(value * 100, 2)))
[[102   5]
 [ 13  51]]
정확도 89.47%
정밀도 91.07%
재현율 79.69%
## Inspect the feature importances of the fitted forest
importances = clf.feature_importances_
# Reuse `features` (the column list the model was trained on) instead of
# repeating the names by hand, so this table cannot drift from the training
# columns. Building the frame from a dict also raises if the two sequences have
# different lengths, rather than silently padding with NaN.
feature_importances = pd.DataFrame(
    {
        'feature_nm': features,
        'importances': importances,
    }
)
print(feature_importances)
     feature_nm  importances
0     area_mean     0.684699
1  texture_mean     0.315301