Decision Tree로 유방암 환자분류

Minsu Kang·2025년 4월 29일
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load scikit-learn's bundled breast-cancer dataset: feature matrix and labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% for testing. Stratifying on y keeps the class ratio the same in
# the train and test splits, so the reported scores do not hinge on a lucky or
# unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Depth-limited tree (max_depth=4) with a fixed seed so the run is repeatable
# and the plotted tree stays small enough to read.
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree Acc:", accuracy_score(y_test, y_pred_dt))
# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred_dt))

plt.figure(figsize=(12, 6))
plot_tree(
    dt,
    # Pass plain lists: the dataset exposes these as NumPy arrays, which
    # scikit-learn 1.3.0's parameter validation for plot_tree rejects.
    feature_names=list(data.feature_names),
    class_names=list(data.target_names),
    filled=True,
)
plt.show()

# Random forest on the same split, for comparison with the single tree:
# 100 trees, fixed seed. fit() returns the estimator, so it can be chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Compute both metrics first, then report them in the same order as before.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print("Random Forest Acc:", rf_accuracy)
print(rf_confusion)

# Imported next to its first use, as this script already does for
# cross_val_score further down.
from sklearn.preprocessing import StandardScaler

# PCA follows the directions of largest variance, so it is sensitive to feature
# scale. The raw columns differ by orders of magnitude (e.g. area vs.
# smoothness); without standardization the projection is dominated by the
# largest-valued features rather than by the structure of the data.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 2-D scatter of all samples, colored by class label.
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="coolwarm", s=40)
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("PCA Projection of Breast Cancer Data")
plt.show()

Visualization

Cross-validation으로 Overfitting 검사

Accuracy가 매우 높게 나왔으므로, Cross-validation으로 Overfitting 가능성을 검사한다.

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the decision tree over the full dataset; a mean
# well below the single hold-out accuracy would suggest the split was optimistic.
scores = cross_val_score(estimator=dt, X=X, y=y, cv=5)
cv_mean = scores.mean()
print("Cross-validation 평균:", cv_mean)
print("각 폴드 정확도:", scores)


물론 dataset이 작기는 하지만, 다른 데이터에도 똑같이 적용할 수 있는 sample code가 되었다.

profile
안녕하세요! 강민수입니다.

0개의 댓글