estimator API

import numpy as np
import matplotlib.pyplot as plt

# 'seaborn-whitegrid' was deprecated in Matplotlib 3.6 in favor of the renamed style below
plt.style.use('seaborn-v0_8-whitegrid')
x = 10 * np.random.rand(50)
y = 2 * x + np.random.rand(50)
plt.scatter(x, y);
# 1. Choose a model class by importing the appropriate estimator class
from sklearn.linear_model import LinearRegression
# 2. Choose model hyperparameters by instantiating the class with the desired values
model = LinearRegression(fit_intercept=True)  # True is also the default
model
# copy_X=True (whether to copy the input data), fit_intercept=True (fit a constant/intercept term),
# n_jobs=None (parallelize across CPUs for large data); the old normalize option was removed in recent scikit-learn versions
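Since every scikit-learn estimator exposes its hyperparameters through the same interface, they can all be inspected at once with the standard get_params() method (a minimal sketch; the exact keys vary by scikit-learn version):

print(model.get_params())
# e.g. {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, ...}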
# 3. Arrange the data into a feature matrix and target vector
X = x[:, np.newaxis]  # reshape the (50,) vector into the (50, 1) 2-D array scikit-learn expects
X
# 4. Fit the model to the data by calling the fit() method of the model instance
model.fit(X, y)  # echoes the fitted estimator
model.coef_ #array([2.01102898])
model.intercept_ #0.43656412446259196
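A quick sanity check: the data were generated as y = 2x + noise with noise drawn from Uniform(0, 1), whose mean is 0.5, so the fitted slope should land near 2 and the intercept near 0.5; the outputs above (about 2.011 and 0.437) agree.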
# 5. Apply the model to new data
xfit = np.linspace(-1, 11)
Xfit = xfit[:, np.newaxis]
yfit = model.predict(Xfit)
plt.scatter(x, y)
plt.plot(xfit, yfit, '--r');
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
print(diabetes.keys())
#dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])
print(diabetes.data)
print(diabetes.target)
print(diabetes.DESCR)
print(diabetes.feature_names) #['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
print(diabetes.data_filename)
print(diabetes.target_filename)
#diabetes_data_raw.csv.gz
#diabetes_target.csv.gz
train_test_split()
: splits a dataset into training and test sets

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.3)

model = LinearRegression()
model.fit(X_train, y_train)

print("Train score : {}".format(model.score(X_train, y_train)))
print("Test score : {}".format(model.score(X_test, y_test)))
#Train score : 0.5093006553809074
#Test score : 0.5195840698115756
import matplotlib.pyplot as plt

predicted = model.predict(X_test)
expected = y_test

plt.figure(figsize=(8, 4))
plt.scatter(expected, predicted)
plt.plot([30, 350], [30, 350], '--r')
plt.tight_layout()
cross_val_score()
: cross-validation

from sklearn.model_selection import cross_val_score, cross_validate

scores = cross_val_score(model, diabetes.data, diabetes.target, cv=5)

print("Cross-validation scores : {}".format(scores))
print("Mean cross-validation score : {} +/- {}".format(np.mean(scores), np.std(scores)))
#Cross-validation scores : [0.42955615 0.52259939 0.48268054 0.42649776 0.55024834]
#Mean cross-validation score : 0.48231643590864215 +/- 0.04926857751190387
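cross_validate is imported above but never called; unlike cross_val_score it returns a dict of per-fold results. A minimal sketch using its real return_train_score parameter:

cv_results = cross_validate(model, diabetes.data, diabetes.target, cv=5, return_train_score=True)
print(cv_results['test_score'])   # per-fold test scores, same values as cross_val_score above
print(cv_results['train_score'])  # per-fold training scores (only present with return_train_score=True)
print(cv_results['fit_time'])     # per-fold fit times in seconds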
GridSearchCV
: cross-validation combined with a search for the best hyperparameters

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
import pandas as pd

alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(alpha=alpha)

gs = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=10)
result = gs.fit(diabetes.data, diabetes.target)

print("Best score : {}".format(result.best_score_))
print("Best parameters : {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_)
multiprocessing
: GridSearchCV using multiprocessing (parallel search across CPU cores)

import multiprocessing
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

iris = load_iris()

param_grid = [{'penalty': ['l1', 'l2'],
               'C': [0.5, 1.0, 1.5, 1.8, 2.0, 2.4]}]

# the default lbfgs solver does not support the l1 penalty, so use liblinear, which supports both
gs = GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=multiprocessing.cpu_count())
result = gs.fit(iris.data, iris.target)

print("Best score : {}".format(result.best_score_))
print("Best parameters : {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_)
preprocessing
: data preprocessing module

StandardScaler
: standardization class

iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df.describe()
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
iris_df_scaled.describe()
X_train, X_test, y_train, y_test = train_test_split(iris_df_scaled, iris.target, test_size=0.3)

model = LogisticRegression()
model.fit(X_train, y_train)

print("Train score : {}".format(model.score(X_train, y_train)))
print("Test score : {}".format(model.score(X_test, y_test)))
#Train score : 0.9809523809523809
#Test score : 0.9333333333333333
MinMaxScaler
: normalization (min-max scaling) class

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_df)
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
iris_df_scaled.describe()
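MinMaxScaler rescales each feature with x' = (x - min) / (max - min), so after the transform every column has minimum 0 and maximum 1, which describe() confirms.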
accuracy_score
: accuracy metric function provided by sklearn.metrics

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = LogisticRegression()
model.fit(X_train, y_train)

print("Train score : {}".format(model.score(X_train, y_train)))
print("Test score : {}".format(model.score(X_test, y_test)))

predict = model.predict(X_test)
print("Accuracy : {}".format(accuracy_score(y_test, predict)))
#Train score : 0.8857142857142857
#Test score : 0.9133333333333333
#Accuracy : 0.9133333333333333
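For classifiers, score() is defined as the accuracy of predict() on the given data, which is why the test score and the accuracy_score value above are identical.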
from sklearn.metrics import confusion_matrix

confmat = confusion_matrix(y_true=y_test, y_pred=predict)
print(confmat)
#[[131  10]
# [ 16 143]]
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.tight_layout()
plt.show()
Precision = TP / (FP + TP)
Recall = TP / (FN + TP)
Accuracy = (TN + TP) / (TN + FP + FN + TP)
Error rate = (FN + FP) / (TN + FP + FN + TP)
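Reading TN = 131, FP = 10, FN = 16, TP = 143 off the confusion matrix above (rows are true labels, columns are predicted labels), the formulas give:

Precision = 143 / (10 + 143) = 143/153 ≈ 0.9346
Recall = 143 / (16 + 143) = 143/159 ≈ 0.8994
Accuracy = (131 + 143) / 300 = 274/300 ≈ 0.9133
Error rate = (16 + 10) / 300 = 26/300 ≈ 0.0867

The accuracy matches the accuracy_score output above, and the precision and recall match the scikit-learn values below.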
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, predict)
recall = recall_score(y_test, predict)

print("Precision : {}".format(precision))
print("Recall : {}".format(recall))  # the original printed precision twice here by mistake
#Precision : 0.934640522875817
#Recall : 0.8993710691823899 (recomputed as 143/159 from the confusion matrix above)
F1 Score (F-measure)
from sklearn.metrics import f1_score

f1 = f1_score(y_test, predict)

print("F1 score : {}".format(f1))
#F1 score : 0.9166666666666666
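F1 is the harmonic mean of precision and recall: F1 = 2 * precision * recall / (precision + recall). Plugging in the values above gives 2 * (143/153) * (143/159) / (143/153 + 143/159) = 286/312 ≈ 0.9167, matching the output.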
The ROC curve shows how the TPR (True Positive Rate) changes as the FPR (False Positive Rate) varies.
The AUC (Area Under Curve) is the area under the ROC curve; the closer it is to 1, the better.
from sklearn.metrics import roc_curve

pred_proba_class1 = model.predict_proba(X_test)[:, 1]
fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)

plt.plot(fprs, tprs, label='ROC')
plt.plot([0, 1], [0, 1], '--k', label='Random')
start, end = plt.xlim()
plt.xticks(np.round(np.arange(start, end, 0.1), 2))
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel("FPR (1 - Specificity)")  # FPR is 1 minus specificity, not 1 minus sensitivity
plt.ylabel("TPR (Recall)")
plt.legend();
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, predict)

print("ROC AUC Score : {}".format(roc_auc))
#ROC AUC Score : 0.9142245416833935
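Passing hard label predictions to roc_auc_score collapses the ROC curve to a single operating point. The usual practice is to score the class-1 probabilities already computed for the ROC plot; a minimal variant (the resulting value will differ from the label-based score above):

roc_auc = roc_auc_score(y_test, pred_proba_class1)
print("ROC AUC Score (from probabilities) : {}".format(roc_auc))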