Classification: the task of assigning data to specific classes using various machine learning algorithms.
There are various metrics for evaluating the performance of a machine learning model.
Confusion matrix (cancer-screening example):

| | Actual Positive (cancer) | Actual Negative (no cancer) |
|---|---|---|
| Predicted Positive (cancer) | TP (True Positive) | FP (False Positive, Type I Error) |
| Predicted Negative (no cancer) | FN (False Negative, Type II Error) | TN (True Negative) |

Meaning of each term:

| Term | Meaning |
|---|---|
| TP | predicted positive and actually positive |
| FP | predicted positive but actually negative (Type I Error) |
| FN | predicted negative but actually positive (Type II Error) |
| TN | predicted negative and actually negative |
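The four counts above determine every metric in this section; a minimal sketch with made-up counts (the numbers are illustrative, not from any dataset below):

TP, FP, FN, TN = 40, 10, 5, 45
accuracy = (TP + TN) / (TP + FP + FN + TN)  # fraction of all predictions that are correct
precision = TP / (TP + FP)                  # of the predicted positives, how many are truly positive
recall = TP / (TP + FN)                     # of the actual positives, how many were found
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean of precision and recall
print(accuracy, precision, recall, f1)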
// Practice code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
iris = load_iris()
iris
// Practice code
print(iris.keys())
print(iris.target_names)
print(iris.feature_names)
print(iris.target)
// Practice code
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
// Practice code
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
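A quick sanity check of the fitted model; with n_neighbors=1 the training accuracy is trivially high (each point is its own nearest neighbor), so the test score is the number to watch:

print('train accuracy:', knn.score(X_train, y_train))  # typically 1.0 for k=1
print('test accuracy :', knn.score(X_test, y_test))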
// Practice code
for i in (1, 3, 5, 7):  # number of neighbors k
    for j in ('uniform', 'distance'):  # distance weighting
        for k in ('auto', 'ball_tree', 'kd_tree', 'brute'):  # search algorithm
            # hyperparameter optimization by trying every combination
            # (a GridSearchCV version is sketched after this cell)
            model = KNeighborsClassifier(n_neighbors=i, weights=j, algorithm=k)
            model.fit(X_train, y_train)
            y_p = model.predict(X_test)
            relation_square = model.score(X_test, y_test)
from sklearn.metrics import confusion_matrix, classification_report
knn_matrix = confusion_matrix(y_test, y_p)  # confusion matrix (misclassification table)
print(knn_matrix)  # look mainly at the diagonal => here it matches the support counts
target_names = ['setosa', 'versicolor', 'virginica']
knn_result = classification_report(y_test, y_p, target_names=target_names)
print(knn_result)
# macro avg   : arithmetic mean of the per-label F1-scores
# weighted avg: per-label F1-scores averaged with weights proportional to sample count (support)
print('\n')
print('accuracy : {:.2f}'.format(knn.score(X_test, y_test)))
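The triple loop above re-fits a model for every hyperparameter combination by hand; scikit-learn's GridSearchCV runs the same search with cross-validation and keeps the best setting. A minimal sketch over the same grid:

from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': [1, 3, 5, 7],
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)  # best combination and its mean CV accuracy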
// Practice code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print(type(cancer))
print(dir(cancer))
# <class 'sklearn.utils._bunch.Bunch'>
# ['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']
// Practice code
print(cancer.data.shape)
print(cancer.feature_names)
print(cancer.target_names)  # malignant: cancerous / benign: non-cancerous
print(cancer.target)
print(np.bincount(cancer.target))  # class frequencies
print(cancer.DESCR)
// Practice code
for i, name in enumerate(cancer.feature_names):
    print('%02d : %s' % (i, name))
print('data =>', cancer.data.shape)
print('target =>', cancer.target.shape)
malignant = cancer.data[cancer.target == 0]
benign = cancer.data[cancer.target == 1]
print('malignant =>', malignant.shape)
print('benign =>', benign.shape)
// Practice code
_, bins = np.histogram(cancer.data[:, 0], bins=20)
print(_)
np.histogram(cancer.data[:, 0], bins=20)
plt.hist(malignant[:, 0], bins=bins, alpha=0.3)  # alpha: transparency
plt.hist(benign[:, 0], bins=bins, alpha=0.3)
plt.title(cancer.feature_names[0])

// Practice code
plt.figure(figsize=[20, 15])
for col in range(30):
    plt.subplot(8, 4, col+1)
    _, bins = np.histogram(cancer.data[:, col], bins=20)  # bin by the current feature, not feature 0
    plt.hist(malignant[:, col], bins=bins, alpha=0.3)  # alpha: transparency
    plt.hist(benign[:, col], bins=bins, alpha=0.3)
    plt.title(cancer.feature_names[col])
    if col == 0: plt.legend(cancer.target_names)
    plt.xticks([])

// Practice code
from sklearn.linear_model import LogisticRegression
scores = []
# Why run a for loop of iterations? What similar method exists?
# => to see how performance changes with the data split; a similar method is K-Fold Cross Validation
#    (see the cross_val_score sketch after this cell)
# However, train_test_split draws a new random train/test split every time,
# => so test data may overlap between iterations, which makes this a weak approach
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2)
    model = LogisticRegression(max_iter=5000)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores.append(score)
print('scores =', scores)
# scores = [0.9473684210526315, 0.9736842105263158, 0.9824561403508771, 0.9122807017543859, 0.9473684210526315, 0.9473684210526315, 0.9473684210526315, 0.9736842105263158, 0.9649122807017544, 0.9736842105263158]
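As noted in the comments above, K-Fold cross-validation gives non-overlapping test folds; a minimal sketch with cross_val_score:

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(max_iter=5000), cancer.data, cancer.target, cv=10)
print('10-fold CV accuracies:', cv_scores)
print('mean CV accuracy :', cv_scores.mean())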
// Practice code
fig = plt.figure(figsize=[14, 14])
fig.suptitle('Breast Cancer - feature analysis', fontsize=20)
for col in range(cancer.feature_names.shape[0]):  # 30 features
    plt.subplot(8, 4, col+1)
    _, bins = np.histogram(cancer.data[:, col], bins=50)
    plt.hist(malignant[:, col], bins=bins, alpha=0.5, label='malignant', color='red')
    plt.hist(benign[:, col], bins=bins, alpha=0.5, label='benign', color='green')
    plt.title(cancer.feature_names[col] + ('(%d)' % col))
    plt.xticks([])
    plt.yticks([])
    if col == 0: plt.legend()

// Practice code
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
scores.append(score)
print(model.coef_)  # regression coefficients
print(model.intercept_)  # intercept
print(model.predict_proba(X_test))  # predicted probabilities for classes (0, 1)
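For binary logistic regression, predict_proba is just the sigmoid of the linear score, so the coefficients and intercept above fully determine it; a minimal sketch reproducing the class-1 probabilities:

z = X_test @ model.coef_.T + model.intercept_  # linear score for each sample
p1 = 1 / (1 + np.exp(-z))                      # sigmoid => P(class 1)
print(np.allclose(p1.ravel(), model.predict_proba(X_test)[:, 1]))  # True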
// Practice code
import pandas as pd
mushroom = pd.read_csv('/content/drive/MyDrive/LG헬로비전_DX_School/250210/Mushroom Classification/mushrooms.csv')
mushroom
// Practice code
print(mushroom.info())  # every column is object dtype -> an encoder is needed
print(mushroom['class'].unique())  # look at the deduplicated values  # p, e
// Practice code
from sklearn.preprocessing import LabelEncoder
mush_encoded = mushroom.copy()
le = LabelEncoder()
for col in mush_encoded.columns:
    mush_encoded[col] = le.fit_transform(mush_encoded[col])
print(mush_encoded.head(10))
print(mush_encoded.columns)
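One caveat: the single LabelEncoder is refit on each column, so only the last column's mapping survives and earlier columns can't be decoded back. A sketch that keeps one encoder per column (the encoders dict is my addition, not from the original):

encoders = {}
mush_encoded = mushroom.copy()
for col in mush_encoded.columns:
    encoders[col] = LabelEncoder()
    mush_encoded[col] = encoders[col].fit_transform(mush_encoded[col])
print(encoders['class'].inverse_transform([0, 1]))  # e.g. ['e' 'p']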
// Practice code
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 10))
sns.heatmap(mush_encoded.corr(), cmap='inferno', square=True)
plt.show()

// Practice code
import matplotlib.pylab as pylab
params = {'legend.fontsize': 'x-large',
          'axes.labelsize': 'x-large',
          'axes.titlesize': 'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}
pylab.rcParams.update(params)
// Practice code
def plot_col(col, hue=None, color=['blue', 'purple'], labels=None):
    fig, ax = plt.subplots(figsize=(15, 7))
    sns.countplot(x=col, hue=hue, palette=color, saturation=0.6, data=mush_encoded, dodge=True, ax=ax)
    ax.set(title=f"Mushroom {col.title()} Quantity", xlabel=f"{col.title()}", ylabel="Quantity")
    if labels is not None:
        ax.set_xticklabels(labels)
    if hue is not None:
        ax.legend(('Poisonous', 'Edible'), loc=0)

class_dict = ('Poisonous', 'Edible')
plot_col(col='class', labels=class_dict)
plt.show()

// Practice code
# Visualizing the number of mushrooms for each of the available cap shapes
shape_dict = {"bell": "b", "conical": "c", "convex": "x", "flat": "f", "knobbed": "k", "sunken": "s"}
labels = ('convex', 'bell', 'sunken', 'flat', 'knobbed', 'conical')
plot_col(col='cap-shape', hue='class', labels=labels)
plt.show()

// Practice code
import plotly.graph_objects as go
labels = ['Edible', 'Poison']
values = mush_encoded['class'].value_counts()
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=['#87CEFA', '#7FFF00'],
                              line=dict(color='#FFFFFF', width=3)))
fig.show()

// Practice code
# Plot to understand the habitat of different mushrooms
labels = ['Woods', 'Grasses', 'Paths', 'Leaves', 'Urban', 'Meadows', 'Waste']
values = mush_encoded['habitat'].value_counts()
colors = ['#DEB887', '#778899', '#B22222', '#FFFF00',
          '#F8F8FF', '#FFE4C4', '#FF69B4']
fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values,
                             # marker_colors=labels,
                             pull=[0.1, 0, 0, 0, 0.2, 0, 0])])
fig.update_traces(title='Mushrooms Habitat Percentage',
                  hoverinfo='label+value',
                  textinfo='percent',
                  opacity=0.9,
                  textfont_size=20,
                  marker=dict(colors=colors,
                              line=dict(color='#000000', width=0.1)),
                  )
fig.show()

// Practice code
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
y = mush_encoded['class'].values
x = mush_encoded.drop(['class'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
from sklearn.linear_model import LogisticRegression
model_LR = LogisticRegression(solver='lbfgs', max_iter=1000)
model_LR.fit(X_train, y_train)
y_prob = model_LR.predict_proba(X_test)[:, 1]  # column 1 = probability of the positive class (target = 1)
y_pred = np.where(y_prob > 0.5, 1, 0)  # assign 1 where P(target = 1) exceeds 0.5, otherwise 0
# Isn't 0.5 an arbitrarily chosen cutoff?
# => choosing some threshold is fine in itself (the effect of moving it is sketched after this cell)
model_LR.score(X_test, y_test)
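To see what the 0.5 cutoff actually does, one can move the threshold and watch precision and recall trade off; a minimal sketch (the threshold values are chosen for illustration):

from sklearn.metrics import precision_score, recall_score
for t in (0.3, 0.5, 0.7):
    pred_t = np.where(y_prob > t, 1, 0)
    print('threshold %.1f -> precision %.3f, recall %.3f'
          % (t, precision_score(y_test, pred_t), recall_score(y_test, pred_t)))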
// Practice code
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
confusion_matrix1 = confusion_matrix(y_test, y_pred)
confusion_matrix1
// Practice code
classification_report1 = classification_report(y_test, y_pred)
print(classification_report1)
// Practice code
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
plt.figure(figsize=(10, 10))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, color='red', label='AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
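For reference, the same AUC can be computed in one call with roc_auc_score:

from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_prob))  # should match the auc() value above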

// Practice code
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
y = mush_encoded['class'].values
x = mush_encoded.drop(['class'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
from sklearn.linear_model import LogisticRegression
tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']}
# liblinear supports both l1 and l2 penalties; lbfgs only supports l2, so the grid would fail with it
model_LR = LogisticRegression(solver='liblinear', max_iter=1000)
from sklearn.model_selection import GridSearchCV
LR = GridSearchCV(model_LR, tuned_parameters, cv=10)
LR.fit(X_train, y_train)
y_prob = LR.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
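After the grid search, the chosen hyperparameters and scores are worth inspecting before plotting; a minimal sketch:

print('best params :', LR.best_params_)
print('best CV score :', LR.best_score_)
print('test accuracy :', LR.score(X_test, y_test))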
// Practice code
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
plt.figure(figsize=(10, 10))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, color='red', label='AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
