24.10.23 Day69

최지원·2024년 10월 23일

데이터 준비 방법

  • 번호, 성별, 나이, 소득
  • 데이터는 요약·집계한 후에 넣어야 한다. ex) 총 주문 수, 총 주문액

정밀도 & 재현율

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluate an already-fitted classifier on the held-out split.
# NOTE(review): `clf`, `X_test` and `y_test` are defined outside this snippet.
# precision_score / recall_score are called with their defaults, so this
# presumably targets a binary problem -- confirm against the calling code.
pred = clf.predict(X_test)
# Last predict_proba column: probability of the last class in clf.classes_
# (the positive class of a binary classifier); this is what roc_auc_score ranks.
pred_proba = clf.predict_proba(X_test)[:, -1]
print(pred_proba)

print(f'정확도: {accuracy_score(y_test, pred)}')  # accuracy

# Label corrected from '혼돈행렬' ("chaos matrix") to '혼동행렬' (confusion matrix).
print('혼동행렬')
print(confusion_matrix(y_test, pred))

print(f'정밀도: {precision_score(y_test, pred)}')  # precision
print(f'재현율: {recall_score(y_test, pred)}')  # recall
print(f'auc score: {roc_auc_score(y_test, pred_proba)}')

트리

  • 하이퍼파라미터
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
# Load the iris dataset and hold out 20% of the samples for testing.
iris_data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data,
    iris_data.target,
    test_size=0.2,
    random_state=11,
)

# Decision tree with its main hyperparameters spelled out explicitly.
clf = DecisionTreeClassifier(
    max_depth=2,           # maximum depth of the tree
    min_samples_split=2,   # minimum samples required to split a node
    min_samples_leaf=10,   # minimum samples that must remain in a leaf
    max_features=None,     # maximum number of feature (X) columns used
    max_leaf_nodes=None,   # maximum number of leaf nodes
    random_state=121,      # random seed
)
clf.fit(X_train, y_train)

# Accuracy on the held-out split.
pred = clf.predict(X_test)
print(accuracy_score(y_test, pred))
  • GridSearchCV 하이퍼파라미터
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters the grid search will tune.
clf = DecisionTreeClassifier(random_state=156)

# Candidate values for each hyperparameter; every combination is tried.
param_grid = {
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print('최적의 파라미터:', grid_search.best_params_)  # best parameters
print('최적의 학습 모델의 정확도:', grid_search.best_score_)  # best CV accuracy

# Score the best estimator found by the search on the held-out split.
best_clf = grid_search.best_estimator_
pred = best_clf.predict(X_test)

print('정확도', accuracy_score(y_test, pred))  # test accuracy
from sklearn.tree import export_graphviz
# Dump the tuned tree to a Graphviz DOT file, labelling nodes with the iris
# class and feature names, showing impurity and colouring the nodes.
export_graphviz(
    best_clf,
    out_file='tree.dot',
    class_names=iris_data.target_names,
    feature_names=iris_data.feature_names,
    impurity=True,
    filled=True,
)
import graphviz

# export_graphviz writes the file as UTF-8, so read it back with the same
# encoding instead of relying on the platform default.
with open('tree.dot', encoding='utf-8') as f:
    dot_graph = f.read()

# Bare expression: presumably rendered inline by a notebook cell -- in a plain
# script this line has no visible effect.
graphviz.Source(dot_graph)

# Bare expression (notebook display), followed by one "name value" line per
# feature.
best_clf.feature_importances_
for name, value in zip(iris_data.feature_names, best_clf.feature_importances_):
    print(name, value)
from sklearn.datasets import make_moons
# Generate a noisy two-moons dataset and hold out 30% of it for testing.
X, y = make_moons(n_samples=300, noise=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train two decision trees: one with unlimited depth, one capped at depth 5.
tree_deep = DecisionTreeClassifier(max_depth=None, random_state=42).fit(X_train, y_train)
tree_shallow = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)

# Accuracy on the training data.
y_train_pred_overfit = tree_deep.predict(X_train)
y_train_pred_regularized = tree_shallow.predict(X_train)
train_accuracy_overfit = accuracy_score(y_train, y_train_pred_overfit)
train_accuracy_regularized = accuracy_score(y_train, y_train_pred_regularized)

# Accuracy on the test data.
y_test_pred_overfit = tree_deep.predict(X_test)
y_test_pred_regularized = tree_shallow.predict(X_test)
test_accuracy_overfit = accuracy_score(y_test, y_test_pred_overfit)
test_accuracy_regularized = accuracy_score(y_test, y_test_pred_regularized)

# Report train vs. test accuracy for both trees.
print(f"Overfitting Model - Training Accuracy: {train_accuracy_overfit:.4f}")
print(f"Overfitting Model - Test Accuracy: {test_accuracy_overfit:.4f}")
print(f"Regularized Model - Training Accuracy: {train_accuracy_regularized:.4f}")
print(f"Regularized Model - Test Accuracy: {test_accuracy_regularized:.4f}")

HyperOpt

  • 최적화
from hyperopt import hp
# 목적함수를 정의(최소값을 알아냄)
def objective_func(search_space):
    """Return the value to minimise for one sampled point: ``x**2 - 20*y``.

    This is a toy objective. In a real tuning run the body would train a
    model and return a loss such as the negative accuracy.

    Args:
        search_space: Mapping with the sampled values under keys ``'x'``
            and ``'y'``.

    Returns:
        ``x ** 2 - 20 * y`` for the given point.
    """
    return search_space['x'] ** 2 - 20 * search_space['y']
# Search space: x sampled from [-10, 10] and y from [-15, 15], both
# quantised with step 1 (hp.quniform).
search_space = {
    'x': hp.quniform('x', -10, 10, 1),
    'y': hp.quniform('y', -15, 15, 1),
}
from hyperopt import fmin, tpe, Trials

# Trials object passed to fmin so the evaluation history is kept.
trial = Trials()
# Minimise objective_func with the TPE algorithm for 20 evaluations.
best_param = fmin(
    fn=objective_func,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trial,
)
print(best_param)
# Author's recorded result for evals=5: {'x': np.float64(-9.0), 'y': np.float64(10.0)}

0개의 댓글