# Load the data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
wine = pd.read_csv("./data/wine.csv")
X = wine.drop(columns="type")
y = wine["type"]
# Split into train and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
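# The split above is not stratified, so it is worth confirming the class
# balance before modeling (a quick sketch, not part of the original code):
print(y.value_counts(normalize=True))
# Passing stratify=y to train_test_split would preserve this ratio in both splits.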
# Shared helper: fit an estimator and print its classification report
def print_perf(estimator):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print(classification_report(y_test, y_pred))
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging: aggregate 10 decision trees, each trained on a bootstrap sample
bg = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=10,
    random_state=42,
)
print_perf(bg)
## Output
#               precision    recall  f1-score   support
#
#            0       0.99      1.00      0.99       959
#            1       0.99      0.96      0.97       341
#
#     accuracy                           0.99      1300
#    macro avg       0.99      0.98      0.98      1300
# weighted avg       0.99      0.99      0.99      1300
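# Because bagging draws bootstrap samples, each tree leaves some rows unseen,
# and those rows give a free validation estimate. A minimal sketch of
# out-of-bag scoring (an addition, not from the original material):
bg_oob = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=10,
    oob_score=True,   # requires the default bootstrap=True
    random_state=42,
)
bg_oob.fit(X_train, y_train)
print(bg_oob.oob_score_)  # OOB accuracy, computed without touching X_test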
from sklearn.ensemble import RandomForestClassifier
# Random forest: bagged trees with a random feature subset considered at each split
rf = RandomForestClassifier(random_state=42)
print_perf(rf)
## Output
#               precision    recall  f1-score   support
#
#            0       0.99      1.00      1.00       959
#            1       1.00      0.98      0.99       341
#
#     accuracy                           0.99      1300
#    macro avg       0.99      0.99      0.99      1300
# weighted avg       0.99      0.99      0.99      1300
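# A fitted random forest exposes impurity-based feature importances, which is
# often the first thing to inspect after training. A short sketch (not in the
# original code; rf is already fitted by print_perf above):
importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))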
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Soft voting: average the class probabilities of the base models
vs = VotingClassifier(
    [("LR", LogisticRegression(solver="liblinear", random_state=42)),
     ("KNN", KNeighborsClassifier()),
     ("DT", DecisionTreeClassifier(random_state=42))],
    voting="soft",
)
print_perf(vs)
## Output
#               precision    recall  f1-score   support
#
#            0       0.99      1.00      0.99       959
#            1       0.99      0.96      0.98       341
#
#     accuracy                           0.99      1300
#    macro avg       0.99      0.98      0.98      1300
# weighted avg       0.99      0.99      0.99      1300
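# Soft voting averages predict_proba across the base models and takes the
# argmax. The sketch below (an addition, not from the original) reproduces
# that by hand on a few test rows; both lines should print the same labels
# because the classes here are 0/1, matching the probability column indices:
import numpy as np
avg_proba = np.mean([clf.predict_proba(X_test[:5]) for clf in vs.estimators_], axis=0)
print(avg_proba.argmax(axis=1))   # manual soft vote
print(vs.predict(X_test[:5]))     # VotingClassifier's own soft vote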
# Hard voting: majority vote over the base models' predicted labels
vh = VotingClassifier(
    [("LR", LogisticRegression(solver="liblinear", random_state=42)),
     ("KNN", KNeighborsClassifier()),
     ("DT", DecisionTreeClassifier(random_state=42))],
    voting="hard",
)
print_perf(vh)
## Output
#               precision    recall  f1-score   support
#
#            0       0.98      1.00      0.99       959
#            1       0.99      0.95      0.97       341
#
#     accuracy                           0.98      1300
#    macro avg       0.98      0.97      0.98      1300
# weighted avg       0.98      0.98      0.98      1300
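# The fitted base models are kept on the ensemble, so their individual test
# accuracy can be compared with the combined vote (a sketch added here, not
# part of the original code):
for name, clf in vh.named_estimators_.items():
    print(name, clf.score(X_test, y_test))
print("vote", vh.score(X_test, y_test))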
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: each new tree focuses on the samples the previous ones misclassified
ada = AdaBoostClassifier(random_state=42, algorithm="SAMME", n_estimators=100)
print_perf(ada)
## Output
#               precision    recall  f1-score   support
#
#            0       0.99      1.00      0.99       959
#            1       0.99      0.98      0.98       341
#
#     accuracy                           0.99      1300
#    macro avg       0.99      0.99      0.99      1300
# weighted avg       0.99      0.99      0.99      1300
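# AdaBoost builds its trees sequentially, so staged_predict can trace how test
# accuracy evolves as estimators are added (a sketch, not in the original;
# printing every 20th stage is an arbitrary choice):
from sklearn.metrics import accuracy_score
for i, y_stage in enumerate(ada.staged_predict(X_test), start=1):
    if i % 20 == 0:
        print(i, accuracy_score(y_test, y_stage))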
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Stacking: a logistic-regression meta-learner combines the base models' predictions
stk = StackingClassifier(
    estimators=[
        ("KNN", KNeighborsClassifier()),
        ("DT", DecisionTreeClassifier(random_state=42)),
    ],
    final_estimator=LogisticRegression(solver="liblinear", random_state=42),  # meta-learner
)
print_perf(stk)
## Output
#               precision    recall  f1-score   support
#
#            0       0.99      0.99      0.99       959
#            1       0.97      0.96      0.97       341
#
#     accuracy                           0.98      1300
#    macro avg       0.98      0.98      0.98      1300
# weighted avg       0.98      0.98      0.98      1300
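# StackingClassifier trains the meta-learner on cross-validated predictions of
# the base models (cv=5 by default). With passthrough=True the meta-learner
# also sees the raw features; a variant worth trying (an addition, not part of
# the original lecture code):
stk_pt = StackingClassifier(
    estimators=[
        ("KNN", KNeighborsClassifier()),
        ("DT", DecisionTreeClassifier(random_state=42)),
    ],
    final_estimator=LogisticRegression(solver="liblinear", random_state=42),
    passthrough=True,  # base predictions + original features go to the meta-learner
)
print_perf(stk_pt)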
*This post is based on excerpts from the lecture materials of 제로베이스 데이터 취업 스쿨 (Zerobase Data School).*