Exited 컬럼의 값을 예측
library(dplyr)
library(caret)
library(ModelMetrics)
library(randomForest)
x_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv")
x_test <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv")
y_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv")
nrow(x_train)
nrow(x_test)
nrow(y_train)
> nrow(x_train)
[1] 6499
> nrow(x_test)
[1] 3501
> nrow(y_train)
[1] 6499
summary(x_train)
summary(y_train)
> summary(x_train)
CustomerId Surname CreditScore Geography
Min. :15565701 Length:6499 Min. :350.0 Length:6499
1st Qu.:15629493 Class :character 1st Qu.:584.0 Class :character
Median :15691808 Mode :character Median :651.0 Mode :character
Mean :15691574 Mean :650.4
3rd Qu.:15753578 3rd Qu.:718.0
Max. :15815660 Max. :850.0
Gender Age Tenure Balance
Length:6499 Min. :18.00 Min. : 0.000 Min. : 0
Class :character 1st Qu.:32.00 1st Qu.: 3.000 1st Qu.: 0
Mode :character Median :37.00 Median : 5.000 Median : 97560
Mean :38.96 Mean : 5.042 Mean : 76837
3rd Qu.:44.00 3rd Qu.: 8.000 3rd Qu.:127845
Max. :92.00 Max. :10.000 Max. :238388
NumOfProducts HasCrCard IsActiveMember EstimatedSalary
Min. :1.00 Min. :0.0000 Min. :0.0000 Min. : 11.58
1st Qu.:1.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 50907.57
Median :1.00 Median :1.0000 Median :1.0000 Median :100496.84
Mean :1.52 Mean :0.7089 Mean :0.5144 Mean :100346.56
3rd Qu.:2.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:150480.16
Max. :4.00 Max. :1.0000 Max. :1.0000 Max. :199970.74
> summary(y_train)
CustomerId Exited
Min. :15565701 Min. :0.0000
1st Qu.:15629493 1st Qu.:0.0000
Median :15691808 Median :0.0000
Mean :15691574 Mean :0.2037
3rd Qu.:15753578 3rd Qu.:0.0000
Max. :15815660 Max. :1.0000
결측값 확인
colSums(is.na(x_train))
colSums(is.na(x_test))
colSums(is.na(y_train))
> colSums(is.na(x_train))
CustomerId Surname CreditScore Geography Gender
0 0 0 0 0
Age Tenure Balance NumOfProducts HasCrCard
0 0 0 0 0
IsActiveMember EstimatedSalary
0 0
> colSums(is.na(x_test))
CustomerId CreditScore Geography Gender Age
0 0 0 0 0
Tenure Balance NumOfProducts HasCrCard IsActiveMember
0 0 0 0 0
EstimatedSalary
0
> colSums(is.na(y_train))
CustomerId Exited
0 0
결측값 없음
편의를 위해서 x_train 데이터와 y_train데이터를 하나로 묶음
full <- merge(x_train, y_train, by = "CustomerId")
Surname, CustomerId 컬럼은 불필요하므로 제거
full <- subset(full, select = -c(Surname, CustomerId))
factor형으로 변환이 필요한 컬럼들은 변환
full$Gender <- as.factor(full$Gender)
full$Geography <- as.factor(full$Geography)
full$Exited <- as.factor(as.character(full$Exited))
full$HasCrCard <- as.factor(as.character(full$HasCrCard))
full$IsActiveMember <- as.factor(as.character(full$IsActiveMember))
full$NumOfProducts <- as.factor(as.character(full$NumOfProducts))
x_test 데이터도 똑같은 작업 진행
x_test의 CustomerId는 마지막 결과 출력에서 필요하기 때문에 남겨둠
x_test에는 Exited컬럼은 없기 때문에 넘어감,
x_test <- subset(x_test, select = -c(Surname))
x_test$Gender <- as.factor(x_test$Gender)
x_test$Geography <- as.factor(x_test$Geography)
x_test$HasCrCard <- as.factor(as.character(x_test$HasCrCard))
x_test$IsActiveMember <- as.factor(as.character(x_test$IsActiveMember))
x_test$NumOfProducts <- as.factor(as.character(x_test$NumOfProducts))
Min-Max 방법을 사용해서 데이터들을 스케일링 하는 작업
model <- preProcess(
full,
method = c('range')
)
full <- predict(model, full)
summary(full)
> summary(full)
CreditScore Geography Gender Age
Min. :0.0000 France :3227 male : 80 Min. :0.0000
1st Qu.:0.4680 Germany:1650 female: 73 1st Qu.:0.1892
Median :0.6020 Spain :1622 Female:2861 Median :0.2568
Mean :0.6008 Male :3485 Mean :0.2832
3rd Qu.:0.7360 3rd Qu.:0.3514
Max. :1.0000 Max. :1.0000
Tenure Balance NumOfProducts HasCrCard IsActiveMember
Min. :0.0000 Min. :0.0000 1:3360 0:1892 0:3156
1st Qu.:0.3000 1st Qu.:0.0000 2:2939 1:4607 1:3343
Median :0.5000 Median :0.4093 3: 161
Mean :0.5042 Mean :0.3223 4: 39
3rd Qu.:0.8000 3rd Qu.:0.5363
Max. :1.0000 Max. :1.0000
EstimatedSalary Exited
Min. :0.0000 0:5175
1st Qu.:0.2545 1:1324
Median :0.5025
Mean :0.5018
3rd Qu.:0.7525
Max. :1.0000
model <- preProcess(
x_test[, c(-1)],
method = c('range')
)
x_test <- predict(model, x_test)
summary(x_test)
> summary(x_test)
CustomerId CreditScore Geography Gender
Min. :15565714 Min. :0.0000 France :1787 male : 53
1st Qu.:15626475 1st Qu.:0.4367 Germany: 859 female: 47
Median :15689168 Median :0.5823 Spain : 855 Female:1562
Mean :15689765 Mean :0.5797 Male :1839
3rd Qu.:15752601 3rd Qu.:0.7194
Max. :15815690 Max. :1.0000
Age Tenure Balance NumOfProducts HasCrCard
Min. :0.0000 Min. :0.0000 Min. :0.0000 1:1724 0:1053
1st Qu.:0.2090 1st Qu.:0.2000 1st Qu.:0.0000 2:1651 1:2448
Median :0.2836 Median :0.5000 Median :0.3834 3: 105
Mean :0.3113 Mean :0.4959 Mean :0.3023 4: 21
3rd Qu.:0.3881 3rd Qu.:0.7000 3rd Qu.:0.5073
Max. :1.0000 Max. :1.0000 Max. :1.0000
IsActiveMember EstimatedSalary
0:1693 Min. :0.0000
1:1808 1st Qu.:0.2558
Median :0.4986
Mean :0.4979
3rd Qu.:0.7320
Max. :1.0000
수치형 데이터들은 모두 0 ~ 1 사이의 값으로 변환이 된 것을 확인
rf <- randomForest(
Exited ~ .,
data = full,
do.trace = TRUE,
ntree = 400
)
pred <- predict(
object = rf,
newdata = x_test,
type = 'class'
)
head(pred, 40)
> head(pred, 40)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
27 28 29 30 31 32 33 34 35 36 37 38 39 40
0 0 0 0 0 0 0 0 0 0 0 0 0 1
Levels: 0 1
result <- data.frame(
x_test$CustomerId,
pred
)
names(result) <- c("CustomerId", "Exited")
head(result)
> head(result)
CustomerId Exited
1 15601012 1
2 15734762 1
3 15586757 0
4 15590888 0
5 15726087 0
6 15769504 0
write.csv(result, "result.csv", row.names = F)
Rtest <- read.csv("result.csv")
head(Rtest)
nrow(Rtest)
> head(Rtest)
CustomerId Exited
1 15601012 1
2 15734762 1
3 15586757 0
4 15590888 0
5 15726087 0
6 15769504 0
> nrow(Rtest)
[1] 3501
library(dplyr)
library(caret)
library(ModelMetrics)
library(randomForest)
x_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv")
x_test <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv")
y_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv")
nrow(x_train)
nrow(x_test)
nrow(y_train)
# Exited 컬럼을 예측
head(y_train$Exited, 100)
summary(x_train)
summary(y_train)
colSums(is.na(x_train))
colSums(is.na(x_test))
colSums(is.na(y_train))
full <- merge(x_train, y_train, by = "CustomerId" )
nrow(full)
summary(full)
colSums(is.na(full))
str(full)
full <- subset(full, select = -c(Surname, CustomerId))
full$Gender <- as.factor(full$Gender)
full$Geography <- as.factor(full$Geography)
full$Exited <- as.factor(as.character(full$Exited))
full$HasCrCard <- as.factor(as.character(full$HasCrCard))
full$IsActiveMember <- as.factor(as.character(full$IsActiveMember))
full$NumOfProducts <- as.factor(as.character(full$NumOfProducts))
x_test <- subset(x_test, select = -c(Surname))
x_test$Gender <- as.factor(x_test$Gender)
x_test$Geography <- as.factor(x_test$Geography)
x_test$HasCrCard <- as.factor(as.character(x_test$HasCrCard))
x_test$IsActiveMember <- as.factor(as.character(x_test$IsActiveMember))
x_test$NumOfProducts <- as.factor(as.character(x_test$NumOfProducts))
model <- preProcess(
full,
method = c('range')
)
full <- predict(model, full)
summary(full)
model <- preProcess(
x_test[, c(-1)],
method = c('range')
)
x_test <- predict(model, x_test)
summary(x_test)
rf <- randomForest(
Exited ~ .,
data = full,
do.trace = TRUE,
ntree = 400
)
pred <- predict(
object = rf,
newdata = x_test,
type = 'class'
)
head(pred, 40)
result <- data.frame(
x_test$CustomerId,
pred
)
names(result) <- c("CustomerId", "Exited")
head(result)
write.csv(result, "result.csv", row.names = F)
Rtest <- read.csv("result.csv")
head(Rtest)
nrow(Rtest)