[R] 이직 여부 판단 예측

: ) YOUNG·2022년 6월 15일
1

빅분기

목록 보기
16/20
post-thumbnail

이직 여부 판단 예측


target 컬럼의 값을 예측

(target: 1: 이직 , 0 : 이직 x)



library

library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)


데이터 호출


x_train  <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv", 
    stringsAsFactor = TRUE, 
    na.strings = c("", "na", "NA", NA))
y_train  <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv", 
    na.strings = c("", "na", "NA", NA))
x_test <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv", 
    stringsAsFactor = TRUE, 
    na.strings = c("", "na", "NA", NA))


데이터 확인

nrow(x_train)
nrow(y_train)
nrow(x_test)
> nrow(x_train)
[1] 12452
> nrow(y_train)
[1] 12452
> nrow(x_test)
[1] 6706

summary(x_train)
summary(y_train)

> summary(x_train)
  enrollee_id          city      city_development_index    gender
 Min.   :    1   city_103:2839   Min.   :0.4480         Female: 808
 1st Qu.: 8592   city_21 :1713   1st Qu.:0.7400         Male  :8595
 Median :17178   city_16 :1005   Median :0.9100         Other : 132  
 Mean   :16966   city_114: 854   Mean   :0.8297         NA's  :2917
 3rd Qu.:25328   city_160: 557   3rd Qu.:0.9200
 Max.   :33380   city_136: 374   Max.   :0.9490
                 (Other) :5110
              relevent_experience       enrolled_university
 Has relevent experience:8953     Full time course:2431
 No relevent experience :3499     no_enrollment   :8975
                                  Part time course: 789
                                  NA's            : 257





       education_level        major_discipline   experience      company_size
 Graduate      :7540   Arts           : 157    >20    :2179   50-99    :2000
 High School   :1336   Business Degree: 195    4      : 945   100-500  :1672
 Masters       :2796   Humanities     : 436    5      : 926   10000+   :1337
 Phd           : 269   No Major       : 136    3      : 858   10/49    : 947
 Primary School: 196   Other          : 248    6      : 795   1000-4999: 852
 NA's          : 315   STEM           :9414    (Other):6712   (Other)  :1792  
                       NA's           :1866    NA's   :  37   NA's     :3852
              company_type  last_new_job training_hours
 Early Stage Startup: 365   >4   :2182   Min.   :  1.0
 Funded Startup     : 652   1    :5157   1st Qu.: 23.0
 NGO                : 340   2    :1895   Median : 47.0
 Other              :  81   3    : 685   Mean   : 65.6
 Public Sector      : 620   4    : 676   3rd Qu.: 88.0
 Pvt Ltd            :6413   never:1584   Max.   :336.0
 NA's               :3981   NA's : 273
> summary(y_train)
  enrollee_id        target
 Min.   :    1   Min.   :0.0000
 1st Qu.: 8592   1st Qu.:0.0000
 Median :17178   Median :0.0000
 Mean   :16966   Mean   :0.2494
 3rd Qu.:25328   3rd Qu.:0.0000
 Max.   :33380   Max.   :1.0000


결측값 확인 및 처리 & 형 변환

colSums(is.na(x_train))
colSums(is.na(x_test))
colSums(is.na(y_train))
> colSums(is.na(x_train))
     CustomerId         Surname     CreditScore       Geography          Gender
              0               0               0               0               0
            Age          Tenure         Balance   NumOfProducts       HasCrCard
              0               0               0               0               0
 IsActiveMember EstimatedSalary
              0               0
              
              
> colSums(is.na(x_test))
     CustomerId     CreditScore       Geography          Gender             Age
              0               0               0               0               0
         Tenure         Balance   NumOfProducts       HasCrCard  IsActiveMember
              0               0               0               0               0
EstimatedSalary
              0
              
              
> colSums(is.na(y_train))
CustomerId     Exited
         0          0


full <- full %>% fill(education_level, .direction = 'updown')
full <- full %>% fill(company_type, .direction = 'updown')
full <- full %>% fill(company_size, .direction = 'updown')
full <- full %>% fill(major_discipline, .direction = 'updown')
full <- full %>% fill(last_new_job, .direction = 'updown')
full <- full %>% fill(enrolled_university, .direction = 'updown')
full <- full %>% fill(experience, .direction = 'updown')

full$gender[is.na(full$gender)] <- 'Male'
full$gender[full$gender == 'Other'] <- 'Male'
full$gender <- factor(full$gender, levels = c('Male', 'Female'))
table(full$gender)

full$relevent_experience <- ifelse(full$relevent_experience == 'Has relevent experience', 'Yes', 'No')
full$relevent_experience <- as.factor(full$relevent_experience)
full$relevent_experience <- relevel(full$relevent_experience, 'Yes') 

full$target <- ifelse(full$target == 1, 'Yes', 'No')
full$target <- as.factor(full$target)
full$target <- relevel(full$target, 'Yes') 


full$city <- as.character(full$city)
colSums(is.na(full))

explain

  • tidyr패키지의 fill 함수를 사용해서 결측값인 값들을 위 아래의 값으로 대체했음

  • geder 컬럼은 'Male'과 'Female'의 비율을 확인해 본 결과 Mail의 비율이 압도적으로 높았다는 것을 확인하고 'Other' 컬럼과 결측값은 모두 'Male'로 대체했음

  • relevent_experience 컬럼은 문장이 너무 길어서 'Yes'와 'No'로 그냥 깔끔하게 변환해줬음
  • target은 목표값인데, 1, 0이 아닌 'Yes', 'No'로 변환해줬음 1이 이직을 한다이므로 'Yes' 설정해줬음
  • city 컬럼은 factor형이었으나, levels 범위가 너무 넓어서 그냥 character형으로 바꿨음

위의 과정을 x_test 데이터에도 동일하게 적용


x_test <- x_test %>% fill(education_level, .direction = 'updown')
x_test <- x_test %>% fill(company_type, .direction = 'updown')
x_test <- x_test %>% fill(company_size, .direction = 'updown')
x_test <- x_test %>% fill(major_discipline, .direction = 'updown')
x_test <- x_test %>% fill(last_new_job, .direction = 'updown')
x_test <- x_test %>% fill(enrolled_university, .direction = 'updown')
x_test <- x_test %>% fill(experience, .direction = 'updown')


x_test$gender[is.na(x_test$gender)] <- 'Male'
x_test$gender[x_test$gender == 'Other'] <- 'Male'
x_test$gender <- factor(x_test$gender, levels = c('Male', 'Female'))
table(x_test$gender)

x_test$relevent_experience <- ifelse(x_test$relevent_experience == 'Has relevent experience', 'Yes', 'No')
x_test$relevent_experience <- as.factor(x_test$relevent_experience)
x_test$relevent_experience <- relevel(x_test$relevent_experience, 'Yes') 
x_test$city <- as.character(x_test$city)


변환 확인


> str(full)
'data.frame':   12452 obs. of  13 variables:
 $ city                  : chr  "city_103" "city_103" "city_67" "city_136" ...
 $ city_development_index: num  0.92 0.92 0.855 0.897 0.92 0.91 0.92 0.624 0.899 0.924 ...
 $ gender                : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 1 1 1 1 1 ...
 $ relevent_experience   : Factor w/ 2 levels "Yes","No": 2 2 1 1 2 1 1 1 1 1 ...
 $ enrolled_university   : Factor w/ 3 levels "Full time course",..: 2 2 2 1 2 2 3 2 3 2 ...
 $ education_level       : Factor w/ 5 levels "Graduate","High School",..: 2 4 3 3 1 1 1 1 1 1 ...
 $ major_discipline      : Factor w/ 6 levels "Arts","Business Degree",..: 6 6 6 6 3 6 6 6 6 6 ...
 $ experience            : Factor w/ 22 levels "<1",">20","1",..: 14 9 4 7 7 2 2 7 18 2 ...
 $ company_size          : Factor w/ 8 levels "<10","10/49",..: 5 5 1 3 3 4 4 8 6 4 ...
 $ company_type          : Factor w/ 6 levels "Early Stage Startup",..: 6 3 1 2 2 6 6 6 5 6 ...
 $ last_new_job          : Factor w/ 6 levels ">4","1","2","3",..: 6 4 2 2 3 1 1 5 1 2 ...
 $ training_hours        : int  150 128 12 18 12 2 78 38 46 126 ...
 $ target                : Factor w/ 2 levels "Yes","No": 2 1 2 2 2 1 2 2 2 2 ...



> str(x_test)
'data.frame':   6706 obs. of  13 variables:
 $ enrollee_id           : int  7129 31037 22179 29724 17977 30328 9446 13697 1184 23961 ...
 $ city                  : chr  "city_23" "city_44" "city_103" "city_50" ...
 $ city_development_index: num  0.899 0.725 0.92 0.896 0.689 0.624 0.848 0.698 0.926 0.92 ...
 $ gender                : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 1 1 1 1 1 ...
 $ relevent_experience   : Factor w/ 2 levels "Yes","No": 2 2 2 2 1 1 1 1 2 1 ...
 $ enrolled_university   : Factor w/ 3 levels "Full time course",..: 2 3 1 1 1 2 2 2 2 2 ...
 $ education_level       : Factor w/ 5 levels "Graduate","High School",..: 1 3 1 1 3 1 1 3 4 1 ...
 $ major_discipline      : Factor w/ 6 levels "Arts","Business Degree",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ experience            : Factor w/ 22 levels "<1",">20","1",..: 14 18 20 20 13 8 19 2 2 8 ...
 $ company_size          : Factor w/ 8 levels "<10","10/49",..: 6 6 6 6 6 6 6 1 5 5 ...
 $ company_type          : Factor w/ 6 levels "Early Stage Startup",..: 6 6 6 6 6 2 2 5 6 6 ...
 $ last_new_job          : Factor w/ 6 levels ">4","1","2","3",..: 2 6 3 6 2 2 2 6 1 2 ...
 $ training_hours        : int  23 39 262 78 125 94 22 55 31 46 ...




> colSums(is.na(full))
                  city city_development_index                 gender
                     0                      0                      0
   relevent_experience    enrolled_university        education_level
                     0                      0                      0 
      major_discipline             experience           company_size
                     0                      0                      0
          company_type           last_new_job         training_hours
                     0                      0                      0
                target
                     0
                     
                     
                     
> colSums(is.na(x_test))
           enrollee_id                   city city_development_index
                     0                      0                      0
                gender    relevent_experience    enrolled_university
                     0                      0                      0
       education_level       major_discipline             experience
                     0                      0                      0
          company_size           company_type           last_new_job
                     0                      0                      0
        training_hours
                     0




데이터 스케일링


해당 데이터는 스케일링이 굳이 필요하지 않다고 판단해서 넘어가기로 함



랜덤포레스트 🌳


rf <- randomForest(
    target ~ . ,
    full,
    ntree = 300,
    do.trace = TRUE
)

auc(rf)

pred <- predict(
    rf, 
    newdata = x_test,
)

head(pred, 10)

> head(pred, 10)
 1  2  3  4  5  6  7  8  9 10
No No No No No No No No No No
Levels: Yes No


결과 출력


list <- ifelse(pred == 'Yes', 1, 0)
list <- as.factor(list)
list

result <- data.frame(
    x_test$enrollee_id,
    list
)
> head(result)
  x_test.enrollee_id list
1               7129    0
2              31037    0
3              22179    0
4              29724    0
5              17977    0
6              30328    0



names(result) <- c("enrollee_id", 'target')
write.csv(result, "result.csv", row.names = F)

Rtest <- read.csv('result.csv')
head(Rtest)
> head(Rtest)
  enrollee_id target
1        7129      0
2       31037      0
3       22179      0
4       29724      0
5       17977      0
6       30328      0


코드

library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)

x_train  <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv", 
    stringsAsFactor = TRUE, 
    na.strings = c("", "na", "NA", NA))
y_train  <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv", 
    na.strings = c("", "na", "NA", NA))
x_test <- read.csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv", 
    stringsAsFactor = TRUE, 
    na.strings = c("", "na", "NA", NA))


nrow(x_train)
nrow(y_train)
nrow(x_test)

summary(x_train)
summary(y_train)


# target 컬럼 예측하기
full <- merge(x_train, y_train, "enrollee_id")
full <- full[, -c(1)]



# full 결측값 제거
colSums(is.na(full))
full <- full %>% fill(education_level, .direction = 'updown')
full <- full %>% fill(company_type, .direction = 'updown')
full <- full %>% fill(company_size, .direction = 'updown')
full <- full %>% fill(major_discipline, .direction = 'updown')
full <- full %>% fill(last_new_job, .direction = 'updown')
full <- full %>% fill(enrolled_university, .direction = 'updown')
full <- full %>% fill(experience, .direction = 'updown')

full$gender[is.na(full$gender)] <- 'Male'
full$gender[full$gender == 'Other'] <- 'Male'
full$gender <- factor(full$gender, levels = c('Male', 'Female'))
table(full$gender)

full$relevent_experience <- ifelse(full$relevent_experience == 'Has relevent experience', 'Yes', 'No')
full$relevent_experience <- as.factor(full$relevent_experience)
full$relevent_experience <- relevel(full$relevent_experience, 'Yes') 

full$target <- ifelse(full$target == 1, 'Yes', 'No')
full$target <- as.factor(full$target)
full$target <- relevel(full$target, 'Yes') 


full$city <- as.character(full$city)
colSums(is.na(full))


################################################################################################


x_test <- x_test %>% fill(education_level, .direction = 'updown')
x_test <- x_test %>% fill(company_type, .direction = 'updown')
x_test <- x_test %>% fill(company_size, .direction = 'updown')
x_test <- x_test %>% fill(major_discipline, .direction = 'updown')
x_test <- x_test %>% fill(last_new_job, .direction = 'updown')
x_test <- x_test %>% fill(enrolled_university, .direction = 'updown')
x_test <- x_test %>% fill(experience, .direction = 'updown')


x_test$gender[is.na(x_test$gender)] <- 'Male'
x_test$gender[x_test$gender == 'Other'] <- 'Male'
x_test$gender <- factor(x_test$gender, levels = c('Male', 'Female'))
table(x_test$gender)

x_test$relevent_experience <- ifelse(x_test$relevent_experience == 'Has relevent experience', 'Yes', 'No')
x_test$relevent_experience <- as.factor(x_test$relevent_experience)
x_test$relevent_experience <- relevel(x_test$relevent_experience, 'Yes') 
x_test$city <- as.character(x_test$city)

colSums(is.na(x_test))

str(full)
str(x_test)

colSums(is.na(full))
colSums(is.na(x_test))

rf <- randomForest(
    target ~ . ,
    full,
    ntree = 300,
    do.trace = TRUE
)

auc(rf)

pred <- predict(
    rf, 
    newdata = x_test,
)

list <- ifelse(pred == 'Yes', 1, 0)
list <- as.factor(list)
list

result <- data.frame(
    x_test$enrollee_id,
    list
)

names(result) <- c("enrollee_id", 'target')
write.csv(result, "result.csv", row.names = F)

0개의 댓글