Ensemble learning comes in several flavors: Voting, Bagging, Boosting, and Stacking.
The ensemble technique covered in this post is the boosting family of algorithms; we will look at AdaBoost and GBM, which belong to that family.
First, let's walk through how boosting works, step by step.
1) The first weak learner separates + and - with a first decision boundary (D1), leaving some points misclassified.
2) The misclassified samples are given larger weights (the mislabeled + marks are drawn larger).
3) The second weak learner, taking those weights into account, classifies + and - again with a second boundary (D2), again leaving some errors.
4) The newly misclassified samples are in turn given larger weights (the mislabeled - marks are drawn larger).
5) The third weak learner, taking the weights into account, classifies + and - once more with a third boundary (D3).
6) Finally, each of the three classifiers receives its own model weight, and they are combined as a weighted sum to produce the final classification.
> library(caret)
> # [list the available models whose name contains 'ada']
> grep('ada', names(getModelInfo()), value = T, ignore.case = T)
[1] "ada" "AdaBag" "AdaBoost.M1" "adaboost" "mxnetAdam"
> # [check adaboost's tuning parameters]
> modelLookup('adaboost')
model parameter label forReg forClass probModel
1 adaboost nIter #Trees FALSE TRUE TRUE
2 adaboost method Method FALSE TRUE TRUE
> # [pre-configure the resampling scheme used to find the optimal parameters]
> control <- caret::trainControl(method = 'repeatedcv',
+ search = 'random',
+                        ## random search over the hyperparameters
+ number = 3,
+ repeats = 3,
+ allowParallel = T,
+ verboseIter = T
+ )
> # [train and validate the model]
> ada_model <- train(credit.rating ~., train,
+ method = "adaboost",
+ metric = 'Accuracy',
+ preProcess = c("zv", "center", "scale", "spatialSign"),
+ # tuneLength = 7,
+ trControl = control)
+ Fold1.Rep1: nIter=116, method=Adaboost.M1
- Fold1.Rep1: nIter=116, method=Adaboost.M1
+ Fold1.Rep1: nIter=506, method=Adaboost.M1 ...
> # [inspect the fitted model and the selected optimum]
> ada_model
AdaBoost Classification Trees
700 samples
20 predictor
2 classes: 'pos', 'neg'
Pre-processing: centered (20), scaled (20), spatial sign
transformation (20)
Resampling: Cross-Validated (3 fold, repeated 3 times)
Summary of sample sizes: 466, 467, 467, 467, 467, 466, ...
Resampling results across tuning parameters:
nIter method Accuracy Kappa
25 Real adaboost 0.7242785 0.2222041
207 Adaboost.M1 0.7385724 0.3222722
672 Adaboost.M1 0.7419085 0.3356293
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were nIter = 672 and method
= Adaboost.M1.
> # [check variable importance]
> plot(varImp(ada_model))
> # [confusion matrix]
> confusionMatrix(test$credit.rating, predict(ada_model, test, type = 'raw'))
Confusion Matrix and Statistics
Reference
Prediction pos neg
pos 39 51
neg 24 186
Accuracy : 0.75
95% CI : (0.697, 0.798)
No Information Rate : 0.79
P-Value [Acc > NIR] : 0.95956
Kappa : 0.349
Mcnemar's Test P-Value : 0.00268
Sensitivity : 0.6190
Specificity : 0.7848
Pos Pred Value : 0.4333
Neg Pred Value : 0.8857
Prevalence : 0.2100
Detection Rate : 0.1300
Detection Prevalence : 0.3000
Balanced Accuracy : 0.7019
'Positive' Class : pos
Whereas AdaBoost above compensates the model by increasing the weights of misclassified samples, GBM is an algorithm that performs its weight updates via gradient descent to arrive at an optimized result.
Instead of adjusting the sample weights at every iteration as AdaBoost does, it fits each new predictor to the residual errors made by the previous predictor.
In other words, it strengthens the weak learner using the previous model's residuals - it is, in effect, a model that predicts the residuals.
> # [pre-configure the resampling scheme used to find the optimal parameters]
> fitControl <- trainControl(method = "repeatedcv",
+ number = 3,
+ repeats = 3,
+ verboseIter = T)
> # [check gbm's tuning parameters]
> modelLookup('gbm')
model parameter label forReg forClass
1 gbm n.trees # Boosting Iterations TRUE TRUE
2 gbm interaction.depth Max Tree Depth TRUE TRUE
3 gbm shrinkage Shrinkage TRUE TRUE
4 gbm n.minobsinnode Min. Terminal Node Size TRUE TRUE
probModel
1 TRUE
2 TRUE
3 TRUE
4 TRUE
> # [set up the hyperparameter grid for grid search]
> tunegrid2 <- expand.grid(n.trees = c(10, 20, 30, 40),
+                        ## number of trees to grow
+ interaction.depth = c(1:10),
+ ## = maxdepth
+ shrinkage = c(0.1),
+                        ## = learning rate
+ n.minobsinnode = c(10:50)
+                        ## minimum number of training-set samples a node must contain to be split further
+ )
> # [train and validate the model]
> gbm_gridsearch2 <- train(credit.rating~.,
+ data = train,
+ method = 'gbm',
+ metric = ifelse(is.factor(train$credit.rating),'Accuracy','RMSE'),
+ tuneGrid = tunegrid2,
+ trControl = fitControl)
> # [see how accuracy varies across the hyperparameter settings]
> trellis.par.set(caretTheme())
> plot(gbm_gridsearch2)
> # [check variable importance]
> summary(gbm_gridsearch2)
var rel.inf
account.balance account.balance 21.1839170
credit.amount credit.amount 19.3042177
credit.duration.months credit.duration.months 10.8977305
credit.purpose credit.purpose 6.9162193
age age 6.9126727
savings savings 5.8127936
previous.credit.payment.status previous.credit.payment.status 5.6003470
current.assets current.assets 5.5444213
installment.rate installment.rate 4.1435804
employment.duration employment.duration 3.6837592
marital.status marital.status 3.1228349
residence.duration residence.duration 1.9844654
apartment.type apartment.type 1.9317795
other.credits other.credits 1.2082644
dependents dependents 1.0179670
telephone telephone 0.4575234
occupation occupation 0.2775065
guarantor guarantor 0.0000000
bank.credits bank.credits 0.0000000
foreign.worker foreign.worker 0.0000000
> # [confusion matrix]
> confusionMatrix(test$credit.rating, predict(gbm_gridsearch2, test))
Confusion Matrix and Statistics
Reference
Prediction pos neg
pos 40 50
neg 23 187
Accuracy : 0.7567
95% CI : (0.704, 0.8041)
No Information Rate : 0.79
P-Value [Acc > NIR] : 0.929670
Kappa : 0.3663
Mcnemar's Test P-Value : 0.002342
Sensitivity : 0.6349
Specificity : 0.7890
Pos Pred Value : 0.4444
Neg Pred Value : 0.8905
Prevalence : 0.2100
Detection Rate : 0.1333
Detection Prevalence : 0.3000
Balanced Accuracy : 0.7120
'Positive' Class : pos