[R] 고객 서비스 이탈 예측

: ) YOUNG·2022년 6월 14일

R 빅데이터분석기사 빅분기

빅분기

목록 보기

15/20

고객 서비스 이탈 예측하기

Exited 컬럼의 값을 예측

library

library(dplyr)
library(caret)
library(ModelMetrics)
library(randomForest)

데이터 호출


x_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv")
x_test <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv")
y_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv")

데이터 확인

nrow(x_train)
nrow(x_test)
nrow(y_train)

> nrow(x_train)
[1] 6499
> nrow(x_test)
[1] 3501
> nrow(y_train)
[1] 6499

summary(x_train)
summary(y_train)

> summary(x_train)
   CustomerId         Surname           CreditScore     Geography
 Min.   :15565701   Length:6499        Min.   :350.0   Length:6499
 1st Qu.:15629493   Class :character   1st Qu.:584.0   Class :character
 Median :15691808   Mode  :character   Median :651.0   Mode  :character  
 Mean   :15691574                      Mean   :650.4
 3rd Qu.:15753578                      3rd Qu.:718.0
 Max.   :15815660                      Max.   :850.0
    Gender               Age            Tenure          Balance
 Length:6499        Min.   :18.00   Min.   : 0.000   Min.   :     0
 Class :character   1st Qu.:32.00   1st Qu.: 3.000   1st Qu.:     0
 Mode  :character   Median :37.00   Median : 5.000   Median : 97560
                    Mean   :38.96   Mean   : 5.042   Mean   : 76837
                    3rd Qu.:44.00   3rd Qu.: 8.000   3rd Qu.:127845  
                    Max.   :92.00   Max.   :10.000   Max.   :238388
 NumOfProducts    HasCrCard      IsActiveMember   EstimatedSalary
 Min.   :1.00   Min.   :0.0000   Min.   :0.0000   Min.   :    11.58
 1st Qu.:1.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 50907.57
 Median :1.00   Median :1.0000   Median :1.0000   Median :100496.84
 Mean   :1.52   Mean   :0.7089   Mean   :0.5144   Mean   :100346.56
 3rd Qu.:2.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:150480.16
 Max.   :4.00   Max.   :1.0000   Max.   :1.0000   Max.   :199970.74



> summary(y_train)
   CustomerId           Exited
 Min.   :15565701   Min.   :0.0000
 1st Qu.:15629493   1st Qu.:0.0000
 Median :15691808   Median :0.0000
 Mean   :15691574   Mean   :0.2037
 3rd Qu.:15753578   3rd Qu.:0.0000
 Max.   :15815660   Max.   :1.0000

결측값 확인

colSums(is.na(x_train))
colSums(is.na(x_test))
colSums(is.na(y_train))

> colSums(is.na(x_train))
     CustomerId         Surname     CreditScore       Geography          Gender
              0               0               0               0               0
            Age          Tenure         Balance   NumOfProducts       HasCrCard
              0               0               0               0               0
 IsActiveMember EstimatedSalary
              0               0
              
              
> colSums(is.na(x_test))
     CustomerId     CreditScore       Geography          Gender             Age
              0               0               0               0               0
         Tenure         Balance   NumOfProducts       HasCrCard  IsActiveMember
              0               0               0               0               0
EstimatedSalary
              0
              
              
> colSums(is.na(y_train))
CustomerId     Exited
         0          0

결측값 없음

데이터 전처리 및 형변환

편의를 위해서 x_train 데이터와 y_train데이터를 하나로 묶음


full <- merge(x_train, y_train, by = "CustomerId")

Surname, CustomerId 컬럼은 불필요하므로 제거

full <- subset(full, select = -c(Surname, CustomerId))

factor형으로 변환이 필요한 컬럼들은 변환

full$Gender <- as.factor(full$Gender)
full$Geography <- as.factor(full$Geography)
full$Exited <- as.factor(as.character(full$Exited))
full$HasCrCard <- as.factor(as.character(full$HasCrCard))
full$IsActiveMember <- as.factor(as.character(full$IsActiveMember))
full$NumOfProducts <- as.factor(as.character(full$NumOfProducts))

x_test 데이터도 똑같은 작업 진행

x_test의 CustomerId는 마지막 결과 출력에서 필요하기 때문에 남겨둠
x_test에는 Exited컬럼은 없기 때문에 넘어감,

x_test <- subset(x_test, select = -c(Surname))
x_test$Gender <- as.factor(x_test$Gender)
x_test$Geography <- as.factor(x_test$Geography)
x_test$HasCrCard <- as.factor(as.character(x_test$HasCrCard))
x_test$IsActiveMember <- as.factor(as.character(x_test$IsActiveMember))
x_test$NumOfProducts <- as.factor(as.character(x_test$NumOfProducts))

데이터 스케일링

Min-Max 방법을 사용해서 데이터들을 스케일링 하는 작업

model <- preProcess(
    full,
    method = c('range')
)
full <- predict(model, full)
summary(full)

> summary(full)
  CreditScore       Geography       Gender          Age
 Min.   :0.0000   France :3227    male :  80   Min.   :0.0000
 1st Qu.:0.4680   Germany:1650   female:  73   1st Qu.:0.1892
 Median :0.6020   Spain  :1622   Female:2861   Median :0.2568  
 Mean   :0.6008                  Male  :3485   Mean   :0.2832
 3rd Qu.:0.7360                                3rd Qu.:0.3514
 Max.   :1.0000                                Max.   :1.0000
     Tenure          Balance       NumOfProducts HasCrCard IsActiveMember
 Min.   :0.0000   Min.   :0.0000   1:3360        0:1892    0:3156
 1st Qu.:0.3000   1st Qu.:0.0000   2:2939        1:4607    1:3343
 Median :0.5000   Median :0.4093   3: 161
 Mean   :0.5042   Mean   :0.3223   4:  39
 3rd Qu.:0.8000   3rd Qu.:0.5363
 Max.   :1.0000   Max.   :1.0000
 EstimatedSalary  Exited
 Min.   :0.0000   0:5175
 1st Qu.:0.2545   1:1324
 Median :0.5025
 Mean   :0.5018
 3rd Qu.:0.7525
 Max.   :1.0000


model <- preProcess(
    x_test[, c(-1)],
    method = c('range')
)
x_test <- predict(model, x_test)
summary(x_test)

> summary(x_test)
   CustomerId        CreditScore       Geography       Gender
 Min.   :15565714   Min.   :0.0000   France :1787    male :  53
 1st Qu.:15626475   1st Qu.:0.4367   Germany: 859   female:  47
 Median :15689168   Median :0.5823   Spain  : 855   Female:1562  
 Mean   :15689765   Mean   :0.5797                  Male  :1839
 3rd Qu.:15752601   3rd Qu.:0.7194
 Max.   :15815690   Max.   :1.0000
      Age             Tenure          Balance       NumOfProducts HasCrCard        
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   1:1724        0:1053
 1st Qu.:0.2090   1st Qu.:0.2000   1st Qu.:0.0000   2:1651        1:2448
 Median :0.2836   Median :0.5000   Median :0.3834   3: 105
 Mean   :0.3113   Mean   :0.4959   Mean   :0.3023   4:  21
 3rd Qu.:0.3881   3rd Qu.:0.7000   3rd Qu.:0.5073
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
 IsActiveMember EstimatedSalary
 0:1693         Min.   :0.0000
 1:1808         1st Qu.:0.2558
                Median :0.4986
                Mean   :0.4979
                3rd Qu.:0.7320
                Max.   :1.0000

수치형 데이터들은 모두 0 ~ 1 사이의 값으로 변환이 된 것을 확인

랜덤포레스트를 통한 예측


rf <- randomForest(
    Exited ~ .,
    data = full,
    do.trace = TRUE,
    ntree = 400
)

pred <- predict(
    object = rf,
    newdata = x_test,
    type = 'class'
)

head(pred, 40)

> head(pred, 40)
 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26      
 1  1  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 
27 28 29 30 31 32 33 34 35 36 37 38 39 40
 0  0  0  0  0  0  0  0  0  0  0  0  0  1
Levels: 0 1

결과 출력


result <- data.frame(
    x_test$CustomerId,
    pred
)
names(result) <- c("CustomerId", "Exited")
head(result)

> head(result)
  CustomerId Exited
1   15601012      1
2   15734762      1
3   15586757      0
4   15590888      0
5   15726087      0
6   15769504      0


write.csv(result, "result.csv", row.names = F)
Rtest <- read.csv("result.csv")
head(Rtest)
nrow(Rtest)

> head(Rtest)
  CustomerId Exited
1   15601012      1
2   15734762      1
3   15586757      0
4   15590888      0
5   15726087      0
6   15769504      0

> nrow(Rtest)
[1] 3501

코드

library(dplyr)
library(caret)
library(ModelMetrics)
library(randomForest)

x_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv")
x_test <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv")
y_train <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv")


nrow(x_train)
nrow(x_test)
nrow(y_train)
# Exited 컬럼을 예측
head(y_train$Exited, 100)


summary(x_train)
summary(y_train)
colSums(is.na(x_train))
colSums(is.na(x_test))
colSums(is.na(y_train))

full <- merge(x_train, y_train, by = "CustomerId" )
nrow(full)
summary(full)
colSums(is.na(full))
str(full)

full <- subset(full, select = -c(Surname, CustomerId))
full$Gender <- as.factor(full$Gender)
full$Geography <- as.factor(full$Geography)
full$Exited <- as.factor(as.character(full$Exited))
full$HasCrCard <- as.factor(as.character(full$HasCrCard))
full$IsActiveMember <- as.factor(as.character(full$IsActiveMember))
full$NumOfProducts <- as.factor(as.character(full$NumOfProducts))


x_test <- subset(x_test, select = -c(Surname))
x_test$Gender <- as.factor(x_test$Gender)
x_test$Geography <- as.factor(x_test$Geography)
x_test$HasCrCard <- as.factor(as.character(x_test$HasCrCard))
x_test$IsActiveMember <- as.factor(as.character(x_test$IsActiveMember))
x_test$NumOfProducts <- as.factor(as.character(x_test$NumOfProducts))


model <- preProcess(
    full,
    method = c('range')
)
full <- predict(model, full)
summary(full)

model <- preProcess(
    x_test[, c(-1)],
    method = c('range')
)
x_test <- predict(model, x_test)
summary(x_test)


rf <- randomForest(
    Exited ~ .,
    data = full,
    do.trace = TRUE,
    ntree = 400
)

pred <- predict(
    object = rf,
    newdata = x_test,
    type = 'class'
)

head(pred, 40)

result <- data.frame(
    x_test$CustomerId,
    pred
)
names(result) <- c("CustomerId", "Exited")
head(result)

write.csv(result, "result.csv", row.names = F)
Rtest <- read.csv("result.csv")
head(Rtest)
nrow(Rtest)

: ) YOUNG

다음 포스트