[이번에 사용한 데이터] : HAR_dataset
import pandas as pd

# Base URL of the HAR (Human Activity Recognition) dataset mirror on GitHub.
# Factored out: the original repeated this prefix in five separate URLs.
HAR_BASE_URL = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset'

# features.txt is whitespace-separated with no header: "<index> <feature name>".
# r'\s+' (raw string) avoids the invalid-escape DeprecationWarning of '\s+'.
feature_name_df = pd.read_csv(f'{HAR_BASE_URL}/features.txt', sep=r'\s+',
                              header=None, names=['columns_index', 'columns_name'])
# Keep only the 561 feature-name strings (second column) as a plain list.
feature_name = feature_name_df.iloc[:, 1].values.tolist()

# Sensor feature matrices: whitespace-separated, no header, one row per window.
X_train = pd.read_csv(f'{HAR_BASE_URL}/train/X_train.txt', sep=r'\s+', header=None)
X_test = pd.read_csv(f'{HAR_BASE_URL}/test/X_test.txt', sep=r'\s+', header=None)
X_train.columns = feature_name
X_test.columns = feature_name

# Activity labels, one integer per row.
y_train = pd.read_csv(f'{HAR_BASE_URL}/train/y_train.txt', sep=r'\s+', header=None, names=['action'])
y_test = pd.read_csv(f'{HAR_BASE_URL}/test/y_test.txt', sep=r'\s+', header=None, names=['action'])
GBM - Gradient Boosting Machine
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings

warnings.filterwarnings('ignore')

# Baseline GradientBoostingClassifier with default hyperparameters.
# NOTE: this fit is very slow on the full HAR training set (tens of minutes).
start_time = time.time()
gb_clf = GradientBoostingClassifier(random_state=13)
# .values.ravel() flattens the (n, 1) label DataFrame into the 1-D array
# sklearn expects, avoiding a DataConversionWarning.
gb_clf.fit(X_train, y_train.values.ravel())
gb_pred = gb_clf.predict(X_test)
print('ACC : ', accuracy_score(y_test, gb_pred))
print('Fit time : ', time.time() - start_time)
다른 분들은 40분만에 결과를 볼 수 있었다고 했는데..
난 저녁 약속을 다녀와도 계속 running 중이라 멈출 수 밖에 없었다....🙄
# Search a small hyperparameter grid for a better GBM configuration.
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 0.1],
}

start_time = time.time()
# cv=2 keeps the (already slow) gradient-boosting search tractable;
# n_jobs=-1 parallelizes across all available cores.
grid = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train.values.ravel())
print('Fit time : ', time.time() - start_time)

# Test-set accuracy of the best estimator found by the search.
# (The original computed this twice as a bare expression, which displays in a
# notebook but is a silent no-op in a script — print it once instead.)
print('Test ACC : ', accuracy_score(y_test, grid.best_estimator_.predict(X_test)))
!pip install xgboost
from xgboost import XGBClassifier
start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
# numpy array 값을 받아들이기 때문에, values 값만 넣어야 한다.
xgb.fit(X_train.values, y_train)
print('Fit time : ', time.time() - start_time)
# 289.586 나옴
accuracy_score(y_test, grid.best_estimator_.predict(X_test.values))
#0.9392 나옴
# Early stopping: halt boosting when the eval-set metric has not improved
# for `early_stopping_rounds` consecutive rounds (rather than "same value
# 10 times" — improvement, not equality, is what is tracked).
from xgboost import XGBClassifier

# Monitor the held-out test set during training.
evals = [(X_test.values, y_test)]

start_time = time.time()
# FIX: since xgboost 1.6, early_stopping_rounds belongs in the constructor;
# passing it to fit() was deprecated and removed in xgboost 2.0.
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3,
                    early_stopping_rounds=10)
# Plain numpy arrays (.values) for the feature matrix, as above.
xgb.fit(X_train.values, y_train, eval_set=evals)
print('Fit time : ', time.time() - start_time)
!pip install lightgbm
start_time = time.time()
from lightgbm import LGBMClassifier
import time
evals = [(X_test.values, y_test)]
start_time = time.time()
lgbm = LGBMClassifier(n_estimators=400)
lgbm.fit(X_train.values, y_train,eval_set=evals)
print('Fit time : ', time.time() - start_time)