# Import required modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
# Raw Data Loading
iris = load_iris()
# Converting to a DataFrame makes the data easier to inspect and process.
df = pd.DataFrame(iris.data,
                  columns=iris.feature_names)
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df['target'] = iris.target
# display(df)
# Assume there are no missing values or outliers.
# Handle duplicate rows
df = df.drop_duplicates()
# Now extract x_data and t_data.
# x_data holds the 4 features
x_data = df.drop(['target'],
                 axis=1,
                 inplace=False).values
t_data = df['target'].values
# Normalizing before splitting the data is a bit more convenient here
# (see the note after the split below about the stricter, leakage-free order).
scaler = MinMaxScaler()
scaler.fit(x_data)
x_data_norm = scaler.transform(x_data)
# Split the data
x_data_train_norm, x_data_test_norm, t_data_train, t_data_test = \
    train_test_split(x_data_norm,
                     t_data,
                     test_size=0.3,
                     stratify=t_data,
                     random_state=0)
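# Note: fitting the scaler on all rows (as above) lets the test rows influence
# the scaling parameters. A minimal, hedged sketch of the stricter order
# (split first, then fit the scaler on the training rows only); the variable
# names below are just for illustration:
x_tr, x_te, t_tr, t_te = train_test_split(x_data, t_data,
                                          test_size=0.3,
                                          stratify=t_data,
                                          random_state=0)
leakfree_scaler = MinMaxScaler().fit(x_tr)          # fit on training rows only
x_tr_norm = leakfree_scaler.transform(x_tr)
x_te_norm = leakfree_scaler.transform(x_te)         # reuse training-set parameters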
# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_data_train_norm,
        t_data_train)
knn_acc = accuracy_score(t_data_test, knn.predict(x_data_test_norm))
print(f'KNN model accuracy : {knn_acc}')
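# A minimal sketch of tuning n_neighbors with cross-validation instead of
# fixing it at 5 (GridSearchCV is from sklearn.model_selection; the candidate
# grid below is only an assumption for illustration):
from sklearn.model_selection import GridSearchCV

knn_grid = GridSearchCV(KNeighborsClassifier(),
                        param_grid={'n_neighbors': [3, 5, 7, 9, 11]},
                        cv=5)
knn_grid.fit(x_data_train_norm, t_data_train)
print(f'best n_neighbors : {knn_grid.best_params_["n_neighbors"]}')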
# SVM
svm = SVC(kernel='linear',
          C=0.5,
          probability=True)   # probability=True enables predict_proba, needed for soft voting below
svm.fit(x_data_train_norm,
        t_data_train)
svm_acc = accuracy_score(t_data_test, svm.predict(x_data_test_norm))
print(f'SVM model accuracy : {svm_acc}')
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(x_data_train_norm,
       t_data_train)
dt_acc = accuracy_score(t_data_test, dt.predict(x_data_test_norm))
print(f'DT model accuracy : {dt_acc}')
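# An optional, quick look at the fitted tree (plot_tree is from sklearn.tree);
# shown only as an illustration of how to inspect the model:
from sklearn.tree import plot_tree

print(f'tree depth : {dt.get_depth()}, leaves : {dt.get_n_leaves()}')
plot_tree(dt,
          feature_names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
          filled=True)
plt.show()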

# Build the ensemble models
# Hard voting classifier (hvc): majority vote over the predicted class labels
hvc = VotingClassifier(estimators=[('KNN', knn),
                                   ('SVM', svm),
                                   ('DT', dt)],
                       voting='hard')
hvc.fit(x_data_train_norm, t_data_train)
hvc_acc = accuracy_score(t_data_test, hvc.predict(x_data_test_norm))
print(f'Ensemble model (hard voting) accuracy : {hvc_acc}')
# Soft voting classifier (svc): average the predicted class probabilities
svc = VotingClassifier(estimators=[('KNN', knn),
                                   ('SVM', svm),
                                   ('DT', dt)],
                       voting='soft')
svc.fit(x_data_train_norm, t_data_train)
svc_acc = accuracy_score(t_data_test, svc.predict(x_data_test_norm))
print(f'Ensemble model (soft voting) accuracy : {svc_acc}')
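# A minimal sketch of what soft voting does under the hood: average the class
# probabilities of the three fitted models and pick the most probable class.
# This should closely track svc.predict() above (illustration only):
proba_avg = (knn.predict_proba(x_data_test_norm) +
             svm.predict_proba(x_data_test_norm) +
             dt.predict_proba(x_data_test_norm)) / 3
manual_soft_pred = np.argmax(proba_avg, axis=1)
print(f'manual soft voting accuracy : {accuracy_score(t_data_test, manual_soft_pred)}')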

%reset
# Ensemble: Bagging
# Implement a Random Forest, an ensemble of Decision Trees
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Raw Data Loading
iris = load_iris()
df = pd.DataFrame(iris.data,
                  columns=iris.feature_names)
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df['target'] = iris.target
# Assume no missing values or outliers
# Remove duplicate rows
df = df.drop_duplicates()
# Prepare the dataset
x_data = df.drop('target',axis=1,inplace=False).values
t_data = df['target'].values
# Normalization
scaler = MinMaxScaler()
scaler.fit(x_data)
x_data_norm = scaler.transform(x_data)
# Split the data
x_data_train_norm, x_data_test_norm, t_data_train, t_data_test = \
    train_test_split(x_data_norm,
                     t_data,
                     stratify=t_data,
                     test_size=0.3,
                     random_state=0)
# Decision Tree (single tree, as a baseline for the forest below)
dt = DecisionTreeClassifier()
dt.fit(x_data_train_norm,
       t_data_train)
dt_acc = accuracy_score(t_data_test, dt.predict(x_data_test_norm))
print(f'DT model accuracy : {dt_acc}')
# Random Forest
# n_estimators = number of decision trees in the forest
# max_depth = maximum depth of each tree
rcf = RandomForestClassifier(n_estimators=50,
                             max_depth=3,
                             random_state=20)
rcf.fit(x_data_train_norm, t_data_train)
rcf_acc = accuracy_score(t_data_test, rcf.predict(x_data_test_norm))
print(f'RandomForest model accuracy : {rcf_acc}')
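# A small, optional sketch inspecting which features the forest relies on
# (feature_importances_ is a standard RandomForestClassifier attribute;
# the feature-name list mirrors the column names above):
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
for name, importance in zip(feature_names, rcf.feature_importances_):
    print(f'{name} : {importance:.3f}')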

# Ensemble: Boosting (XGBoost)
%reset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
# Raw Data Loading
iris = load_iris()
df = pd.DataFrame(iris.data,
                  columns=iris.feature_names)
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df['target'] = iris.target
# Assume no missing values or outliers
# Remove duplicate rows
df = df.drop_duplicates()
# Prepare the dataset
x_data = df.drop('target',axis=1,inplace=False).values
t_data = df['target'].values
# Normalization
scaler = MinMaxScaler()
scaler.fit(x_data)
x_data_norm = scaler.transform(x_data)
# Split the data
x_data_train_norm, x_data_test_norm, t_data_train, t_data_test = \
    train_test_split(x_data_norm,
                     t_data,
                     stratify=t_data,
                     test_size=0.3,
                     random_state=0)
# XGBoost classifier (gradient boosting over decision trees)
xgb = XGBClassifier(n_estimators=50,
                    max_depth=3,
                    random_state=20)
xgb.fit(x_data_train_norm, t_data_train)
xgb_acc = accuracy_score(t_data_test, xgb.predict(x_data_test_norm))
print(f'XGB model accuracy : {xgb_acc}')
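# A minimal sketch of checking the boosted model with 5-fold cross-validation
# on the full normalized dataset (cross_val_score is from sklearn.model_selection;
# shown only as an illustration, not part of the original flow):
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(XGBClassifier(n_estimators=50,
                                          max_depth=3,
                                          random_state=20),
                            x_data_norm,
                            t_data,
                            cv=5)
print(f'XGB 5-fold CV accuracy : {cv_scores.mean():.3f}')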
