(target: 1 = 이직함, 0 = 이직하지 않음)
library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)
# Load the HR job-change data: training features, training labels and the test
# features to be scored. Empty cells and the strings "na"/"NA" are read as
# missing values.
x_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv",
  # Argument name spelled out in full instead of relying on partial matching.
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA")
)
y_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv",
  na.strings = c("", "na", "NA")
)
x_test <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA")
)

# Row counts: features and labels of the training set should line up.
nrow(x_train)
nrow(y_train)
nrow(x_test)
> nrow(x_train)
[1] 12452
> nrow(y_train)
[1] 12452
> nrow(x_test)
[1] 6706
# Per-column summaries of the training features and labels: level counts for
# factors, quantiles for numeric columns, and the number of NA's per column.
summary(x_train)
summary(y_train)
> summary(x_train)
enrollee_id city city_development_index gender
Min. : 1 city_103:2839 Min. :0.4480 Female: 808
1st Qu.: 8592 city_21 :1713 1st Qu.:0.7400 Male :8595
Median :17178 city_16 :1005 Median :0.9100 Other : 132
Mean :16966 city_114: 854 Mean :0.8297 NA's :2917
3rd Qu.:25328 city_160: 557 3rd Qu.:0.9200
Max. :33380 city_136: 374 Max. :0.9490
(Other) :5110
relevent_experience enrolled_university
Has relevent experience:8953 Full time course:2431
No relevent experience :3499 no_enrollment :8975
Part time course: 789
NA's : 257
education_level major_discipline experience company_size
Graduate :7540 Arts : 157 >20 :2179 50-99 :2000
High School :1336 Business Degree: 195 4 : 945 100-500 :1672
Masters :2796 Humanities : 436 5 : 926 10000+ :1337
Phd : 269 No Major : 136 3 : 858 10/49 : 947
Primary School: 196 Other : 248 6 : 795 1000-4999: 852
NA's : 315 STEM :9414 (Other):6712 (Other) :1792
NA's :1866 NA's : 37 NA's :3852
company_type last_new_job training_hours
Early Stage Startup: 365 >4 :2182 Min. : 1.0
Funded Startup : 652 1 :5157 1st Qu.: 23.0
NGO : 340 2 :1895 Median : 47.0
Other : 81 3 : 685 Mean : 65.6
Public Sector : 620 4 : 676 3rd Qu.: 88.0
Pvt Ltd :6413 never:1584 Max. :336.0
NA's :3981 NA's : 273
> summary(y_train)
enrollee_id target
Min. : 1 Min. :0.0000
1st Qu.: 8592 1st Qu.:0.0000
Median :17178 Median :0.0000
Mean :16966 Mean :0.2494
3rd Qu.:25328 3rd Qu.:0.0000
Max. :33380 Max. :1.0000
# Number of missing values in each column of the three data sets.
colSums(is.na(x_train))
colSums(is.na(x_test))
colSums(is.na(y_train))
> colSums(is.na(x_train))
CustomerId Surname CreditScore Geography Gender
0 0 0 0 0
Age Tenure Balance NumOfProducts HasCrCard
0 0 0 0 0
IsActiveMember EstimatedSalary
0 0
> colSums(is.na(x_test))
CustomerId CreditScore Geography Gender Age
0 0 0 0 0
Tenure Balance NumOfProducts HasCrCard IsActiveMember
0 0 0 0 0
EstimatedSalary
0
> colSums(is.na(y_train))
CustomerId Exited
0 0
# Build the training frame: join features and labels on enrollee_id, then drop
# the id so it is not used as a predictor.
full <- merge(x_train, y_train, by = "enrollee_id")
full <- full[, names(full) != "enrollee_id"]

# Impute missing categorical values from neighbouring rows: each NA takes the
# next non-missing value below it, falling back to the previous one above.
full <- full %>%
  fill(
    education_level, company_type, company_size, major_discipline,
    last_new_job, enrolled_university, experience,
    .direction = "updown"
  )

# Missing and "Other" genders are folded into the majority class "Male",
# leaving a two-level factor.
full$gender[is.na(full$gender) | full$gender == "Other"] <- "Male"
full$gender <- factor(full$gender, levels = c("Male", "Female"))
table(full$gender)

# Shorten the experience flag to Yes/No, with "Yes" as the first level.
full$relevent_experience <- relevel(
  as.factor(
    ifelse(full$relevent_experience == "Has relevent experience", "Yes", "No")
  ),
  ref = "Yes"
)

# Recode the label (1 = changes job) as a Yes/No factor, "Yes" first.
full$target <- relevel(
  as.factor(ifelse(full$target == 1, "Yes", "No")),
  ref = "Yes"
)

# city has too many distinct levels to keep as a factor; store it as text.
full$city <- as.character(full$city)

# Confirm that no missing values remain.
colSums(is.na(full))
explain
tidyr패키지의 fill 함수를 사용해서 결측값인 값들을 위 아래의 값으로 대체했음
gender 컬럼은 'Male'과 'Female'의 비율을 확인해 본 결과 Male의 비율이 압도적으로 높았다는 것을 확인하고 'Other' 값과 결측값은 모두 'Male'로 대체했음
relevent_experience 컬럼은 문장이 너무 길어서 'Yes'와 'No'로 그냥 깔끔하게 변환해줬음
target은 목표값인데, 1, 0이 아닌 'Yes', 'No'로 변환해줬음. 1이 '이직을 한다'이므로 'Yes'로 설정해줬음
city 컬럼은 factor형이었으나, levels 범위가 너무 넓어서 그냥 character형으로 바꿨음
위의 과정을 x_test 데이터에도 동일하게 적용
# Apply the training-set preprocessing to the test features in one pipeline.
x_test <- x_test %>%
  # Fill missing categorical values from neighbouring rows.
  fill(
    education_level, company_type, company_size, major_discipline,
    last_new_job, enrolled_university, experience,
    .direction = "updown"
  ) %>%
  mutate(
    # Missing and "Other" genders become "Male"; keep a two-level factor.
    gender = replace(gender, is.na(gender) | gender == "Other", "Male"),
    gender = factor(gender, levels = c("Male", "Female")),
    # Yes/No flag with "Yes" as the first level.
    relevent_experience = relevel(
      as.factor(
        ifelse(relevent_experience == "Has relevent experience", "Yes", "No")
      ),
      ref = "Yes"
    ),
    # Stored as text rather than as a factor.
    city = as.character(city)
  )

# Gender distribution after recoding.
table(x_test$gender)
변환 확인
> str(full)
'data.frame': 12452 obs. of 13 variables:
$ city : chr "city_103" "city_103" "city_67" "city_136" ...
$ city_development_index: num 0.92 0.92 0.855 0.897 0.92 0.91 0.92 0.624 0.899 0.924 ...
$ gender : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 1 1 1 1 1 ...
$ relevent_experience : Factor w/ 2 levels "Yes","No": 2 2 1 1 2 1 1 1 1 1 ...
$ enrolled_university : Factor w/ 3 levels "Full time course",..: 2 2 2 1 2 2 3 2 3 2 ...
$ education_level : Factor w/ 5 levels "Graduate","High School",..: 2 4 3 3 1 1 1 1 1 1 ...
$ major_discipline : Factor w/ 6 levels "Arts","Business Degree",..: 6 6 6 6 3 6 6 6 6 6 ...
$ experience : Factor w/ 22 levels "<1",">20","1",..: 14 9 4 7 7 2 2 7 18 2 ...
$ company_size : Factor w/ 8 levels "<10","10/49",..: 5 5 1 3 3 4 4 8 6 4 ...
$ company_type : Factor w/ 6 levels "Early Stage Startup",..: 6 3 1 2 2 6 6 6 5 6 ...
$ last_new_job : Factor w/ 6 levels ">4","1","2","3",..: 6 4 2 2 3 1 1 5 1 2 ...
$ training_hours : int 150 128 12 18 12 2 78 38 46 126 ...
$ target : Factor w/ 2 levels "Yes","No": 2 1 2 2 2 1 2 2 2 2 ...
> str(x_test)
'data.frame': 6706 obs. of 13 variables:
$ enrollee_id : int 7129 31037 22179 29724 17977 30328 9446 13697 1184 23961 ...
$ city : chr "city_23" "city_44" "city_103" "city_50" ...
$ city_development_index: num 0.899 0.725 0.92 0.896 0.689 0.624 0.848 0.698 0.926 0.92 ...
$ gender : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 1 1 1 1 1 ...
$ relevent_experience : Factor w/ 2 levels "Yes","No": 2 2 2 2 1 1 1 1 2 1 ...
$ enrolled_university : Factor w/ 3 levels "Full time course",..: 2 3 1 1 1 2 2 2 2 2 ...
$ education_level : Factor w/ 5 levels "Graduate","High School",..: 1 3 1 1 3 1 1 3 4 1 ...
$ major_discipline : Factor w/ 6 levels "Arts","Business Degree",..: 6 6 6 6 6 6 6 6 6 6 ...
$ experience : Factor w/ 22 levels "<1",">20","1",..: 14 18 20 20 13 8 19 2 2 8 ...
$ company_size : Factor w/ 8 levels "<10","10/49",..: 6 6 6 6 6 6 6 1 5 5 ...
$ company_type : Factor w/ 6 levels "Early Stage Startup",..: 6 6 6 6 6 2 2 5 6 6 ...
$ last_new_job : Factor w/ 6 levels ">4","1","2","3",..: 2 6 3 6 2 2 2 6 1 2 ...
$ training_hours : int 23 39 262 78 125 94 22 55 31 46 ...
> colSums(is.na(full))
city city_development_index gender
0 0 0
relevent_experience enrolled_university education_level
0 0 0
major_discipline experience company_size
0 0 0
company_type last_new_job training_hours
0 0 0
target
0
> colSums(is.na(x_test))
enrollee_id city city_development_index
0 0 0
gender relevent_experience enrolled_university
0 0 0
education_level major_discipline experience
0 0 0
company_size company_type last_new_job
0 0 0
training_hours
0
해당 데이터는 스케일링이 굳이 필요하지 않다고 판단해서 넘어가기로 함
# city is a character column. Character predictors reach the forest through
# data.matrix(), which assigns integer codes from the values present in that
# particular data frame, so the same city could receive different codes at
# training and prediction time. Encode city once, from the cities of both data
# sets, so the codes agree.
city_levels <- sort(
  unique(c(as.character(full$city), as.character(x_test$city)))
)
full$city <- match(as.character(full$city), city_levels)
x_test$city <- match(as.character(x_test$city), city_levels)

# Fit a 300-tree random forest on every remaining column; do.trace prints the
# running error while the trees are grown.
rf <- randomForest(
  target ~ .,
  data = full,
  ntree = 300,
  do.trace = TRUE
)

# AUC reported by ModelMetrics for the fitted forest.
auc(rf)

# Predicted class (Yes/No) for every test row.
pred <- predict(rf, newdata = x_test)
head(pred, 10)
> head(pred, 10)
1 2 3 4 5 6 7 8 9 10
No No No No No No No No No No
Levels: Yes No
# Map the Yes/No predictions back to 1/0 labels (as a factor) and echo them.
list <- as.factor(ifelse(pred == "Yes", 1, 0))
list

# Pair every test enrollee id with its predicted label.
result <- data.frame(x_test$enrollee_id, list)
> head(result)
x_test.enrollee_id list
1 7129 0
2 31037 0
3 22179 0
4 29724 0
5 17977 0
6 30328 0
# Give the submission its final column names and write it without row names.
names(result) <- c("enrollee_id", "target")
write.csv(result, "result.csv", row.names = FALSE)

# Read the file back to check what was written.
Rtest <- read.csv("result.csv")
head(Rtest)
> head(Rtest)
enrollee_id target
1 7129 0
2 31037 0
3 22179 0
4 29724 0
5 17977 0
6 30328 0
library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)
# Load the HR job-change data: training features, training labels and the test
# features to be scored. Empty cells and the strings "na"/"NA" are read as
# missing values.
x_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv",
  # Argument name spelled out in full instead of relying on partial matching.
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA")
)
y_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv",
  na.strings = c("", "na", "NA")
)
x_test <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA")
)

# Row counts: features and labels of the training set should line up.
nrow(x_train)
nrow(y_train)
nrow(x_test)

# Per-column summaries of the training features and labels.
summary(x_train)
summary(y_train)
# Goal: predict the target column.
# Join features and labels on enrollee_id, then drop the id (first column).
full <- merge(x_train, y_train, by = "enrollee_id")[, -1]

# Handle the missing values in full; counts before imputation:
colSums(is.na(full))

full <- full %>%
  # Fill missing categorical values from neighbouring rows.
  fill(
    education_level, company_type, company_size, major_discipline,
    last_new_job, enrolled_university, experience,
    .direction = "updown"
  ) %>%
  mutate(
    # Missing and "Other" genders become "Male"; keep a two-level factor.
    gender = replace(gender, is.na(gender) | gender == "Other", "Male"),
    gender = factor(gender, levels = c("Male", "Female")),
    # Yes/No flag with "Yes" as the first level.
    relevent_experience = relevel(
      as.factor(
        ifelse(relevent_experience == "Has relevent experience", "Yes", "No")
      ),
      ref = "Yes"
    ),
    # Label 1/0 recoded as Yes/No, "Yes" first.
    target = relevel(as.factor(ifelse(target == 1, "Yes", "No")), ref = "Yes"),
    # Stored as text rather than as a factor.
    city = as.character(city)
  )

# Gender distribution after recoding, then missing-value counts after
# imputation.
table(full$gender)
colSums(is.na(full))
################################################################################################
# Apply the training-set preprocessing to the test features in one pipeline.
x_test <- x_test %>%
  # Fill missing categorical values from neighbouring rows.
  fill(
    education_level, company_type, company_size, major_discipline,
    last_new_job, enrolled_university, experience,
    .direction = "updown"
  ) %>%
  mutate(
    # Missing and "Other" genders become "Male"; keep a two-level factor.
    gender = replace(gender, is.na(gender) | gender == "Other", "Male"),
    gender = factor(gender, levels = c("Male", "Female")),
    # Yes/No flag with "Yes" as the first level.
    relevent_experience = relevel(
      as.factor(
        ifelse(relevent_experience == "Has relevent experience", "Yes", "No")
      ),
      ref = "Yes"
    ),
    # Stored as text rather than as a factor.
    city = as.character(city)
  )

# Gender distribution after recoding, then remaining missing-value counts.
table(x_test$gender)
colSums(is.na(x_test))
# Final check before modelling: column types of both frames and the number of
# missing values left in each column.
str(full)
str(x_test)
colSums(is.na(full))
colSums(is.na(x_test))
# city is a character column. Character predictors reach the forest through
# data.matrix(), which assigns integer codes from the values present in that
# particular data frame, so the same city could receive different codes at
# training and prediction time. Encode city once, from the cities of both data
# sets, so the codes agree.
city_levels <- sort(
  unique(c(as.character(full$city), as.character(x_test$city)))
)
full$city <- match(as.character(full$city), city_levels)
x_test$city <- match(as.character(x_test$city), city_levels)

# Fit a 300-tree random forest on every remaining column; do.trace prints the
# running error while the trees are grown.
rf <- randomForest(
  target ~ .,
  data = full,
  ntree = 300,
  do.trace = TRUE
)

# AUC reported by ModelMetrics for the fitted forest.
auc(rf)

# Predicted class (Yes/No) for every test row.
pred <- predict(rf, newdata = x_test)
# Map the Yes/No predictions back to the 1/0 coding of the target column.
# Named target_pred rather than `list` so base::list is not shadowed.
target_pred <- as.factor(ifelse(pred == "Yes", 1, 0))
target_pred

# Submission table: one row per test enrollee with its predicted label.
result <- data.frame(
  enrollee_id = x_test$enrollee_id,
  target = target_pred
)
write.csv(result, "result.csv", row.names = FALSE)