(target: 1 = 이직함, 0 = 이직하지 않음)
library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)
# Load the training features, training labels and test features.
# Empty cells and the strings "na"/"NA" are read as missing values.
# Text columns of the two feature tables are read as factors; the argument
# name is spelled out in full instead of relying on partial matching.
x_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA", NA)
)
y_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv",
  na.strings = c("", "na", "NA", NA)
)
x_test <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA", NA)
)

# Row counts of the three tables.
nrow(x_train)
nrow(y_train)
nrow(x_test)
> nrow(x_train)
[1] 12452
> nrow(y_train)
[1] 12452
> nrow(x_test)
[1] 6706
# Per-column summaries of the training features and the training labels.
summary(x_train)
summary(y_train)
> summary(x_train)
enrollee_id city city_development_index gender
Min. : 1 city_103:2839 Min. :0.4480 Female: 808
1st Qu.: 8592 city_21 :1713 1st Qu.:0.7400 Male :8595
Median :17178 city_16 :1005 Median :0.9100 Other : 132
Mean :16966 city_114: 854 Mean :0.8297 NA's :2917
3rd Qu.:25328 city_160: 557 3rd Qu.:0.9200
Max. :33380 city_136: 374 Max. :0.9490
(Other) :5110
relevent_experience enrolled_university
Has relevent experience:8953 Full time course:2431
No relevent experience :3499 no_enrollment :8975
Part time course: 789
NA's : 257
education_level major_discipline experience company_size
Graduate :7540 Arts : 157 >20 :2179 50-99 :2000
High School :1336 Business Degree: 195 4 : 945 100-500 :1672
Masters :2796 Humanities : 436 5 : 926 10000+ :1337
Phd : 269 No Major : 136 3 : 858 10/49 : 947
Primary School: 196 Other : 248 6 : 795 1000-4999: 852
NA's : 315 STEM :9414 (Other):6712 (Other) :1792
NA's :1866 NA's : 37 NA's :3852
company_type last_new_job training_hours
Early Stage Startup: 365 >4 :2182 Min. : 1.0
Funded Startup : 652 1 :5157 1st Qu.: 23.0
NGO : 340 2 :1895 Median : 47.0
Other : 81 3 : 685 Mean : 65.6
Public Sector : 620 4 : 676 3rd Qu.: 88.0
Pvt Ltd :6413 never:1584 Max. :336.0
NA's :3981 NA's : 273
> summary(y_train)
enrollee_id target
Min. : 1 Min. :0.0000
1st Qu.: 8592 1st Qu.:0.0000
Median :17178 Median :0.0000
Mean :16966 Mean :0.2494
3rd Qu.:25328 3rd Qu.:0.0000
Max. :33380 Max. :1.0000
# Number of missing values per column in each of the three tables.
colSums(is.na(x_train))
colSums(is.na(x_test))
colSums(is.na(y_train))
> colSums(is.na(x_train))
CustomerId Surname CreditScore Geography Gender
0 0 0 0 0
Age Tenure Balance NumOfProducts HasCrCard
0 0 0 0 0
IsActiveMember EstimatedSalary
0 0
> colSums(is.na(x_test))
CustomerId CreditScore Geography Gender Age
0 0 0 0 0
Tenure Balance NumOfProducts HasCrCard IsActiveMember
0 0 0 0 0
EstimatedSalary
0
> colSums(is.na(y_train))
CustomerId Exited
0 0
# Build the modelling table: join features and labels on enrollee_id, then
# drop the id column (merge() puts the key first) so it is not a predictor.
full <- merge(x_train, y_train, by = "enrollee_id")
full <- full[, -1]

# Impute missing categorical values from neighbouring rows: each NA takes the
# next non-missing value below it, and any trailing NAs take the value above.
full <- full %>%
  fill(
    education_level, company_type, company_size, major_discipline,
    last_new_job, enrolled_university, experience,
    .direction = "updown"
  )

# Collapse gender to two classes: missing values and "Other" become "Male".
full$gender[is.na(full$gender)] <- "Male"
full$gender[full$gender == "Other"] <- "Male"
full$gender <- factor(full$gender, levels = c("Male", "Female"))
table(full$gender)

# Shorten the experience labels to Yes/No, with "Yes" as the first level.
full$relevent_experience <- ifelse(
  full$relevent_experience == "Has relevent experience", "Yes", "No"
)
full$relevent_experience <- as.factor(full$relevent_experience)
full$relevent_experience <- relevel(full$relevent_experience, "Yes")

# Recode the 1/0 target as a Yes/No factor, with "Yes" as the first level.
full$target <- ifelse(full$target == 1, "Yes", "No")
full$target <- as.factor(full$target)
full$target <- relevel(full$target, "Yes")

# Store city as plain text instead of a factor.
full$city <- as.character(full$city)

# Confirm that no missing values remain.
colSums(is.na(full))
explain
tidyr 패키지의 fill 함수를 사용해서 결측값을 위아래 행의 값으로 대체했음
gender 컬럼은 'Male'과 'Female'의 비율을 확인해 본 결과 Male의 비율이 압도적으로 높았다는 것을 확인하고, 'Other' 값과 결측값은 모두 'Male'로 대체했음
relevent_experience 컬럼은 문장이 너무 길어서 'Yes'와 'No'로 그냥 깔끔하게 변환해줬음
target 은 목표값인데, 1, 0이 아닌 'Yes', 'No'로 변환해줬음. 1이 "이직을 한다"이므로 'Yes'로 설정해줬음
city 컬럼은 factor형이었으나, levels 범위가 너무 넓어서 그냥 character형으로 바꿨음
위의 과정을 x_test 데이터에도 동일하게 적용
# Clean the test features with the same steps used for the training table.

# Neighbour-based imputation for the categorical columns (up first, then down).
x_test <- fill(
  x_test,
  education_level, company_type, company_size, major_discipline,
  last_new_job, enrolled_university, experience,
  .direction = "updown"
)

# Missing and "Other" genders are both mapped to "Male".
x_test$gender[is.na(x_test$gender) | x_test$gender == "Other"] <- "Male"
x_test$gender <- factor(x_test$gender, levels = c("Male", "Female"))
table(x_test$gender)

# Yes/No version of the experience flag, "Yes" as the first level.
x_test$relevent_experience <- relevel(
  as.factor(
    ifelse(x_test$relevent_experience == "Has relevent experience", "Yes", "No")
  ),
  "Yes"
)

# City is kept as plain text rather than a factor.
x_test$city <- as.character(x_test$city)
변환 확인
> str(full)
'data.frame': 12452 obs. of 13 variables:
$ city : chr "city_103" "city_103" "city_67" "city_136" ...
$ city_development_index: num 0.92 0.92 0.855 0.897 0.92 0.91 0.92 0.624 0.899 0.924 ...
$ gender : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 1 1 1 1 1 ...
$ relevent_experience : Factor w/ 2 levels "Yes","No": 2 2 1 1 2 1 1 1 1 1 ...
$ enrolled_university : Factor w/ 3 levels "Full time course",..: 2 2 2 1 2 2 3 2 3 2 ...
$ education_level : Factor w/ 5 levels "Graduate","High School",..: 2 4 3 3 1 1 1 1 1 1 ...
$ major_discipline : Factor w/ 6 levels "Arts","Business Degree",..: 6 6 6 6 3 6 6 6 6 6 ...
$ experience : Factor w/ 22 levels "<1",">20","1",..: 14 9 4 7 7 2 2 7 18 2 ...
$ company_size : Factor w/ 8 levels "<10","10/49",..: 5 5 1 3 3 4 4 8 6 4 ...
$ company_type : Factor w/ 6 levels "Early Stage Startup",..: 6 3 1 2 2 6 6 6 5 6 ...
$ last_new_job : Factor w/ 6 levels ">4","1","2","3",..: 6 4 2 2 3 1 1 5 1 2 ...
$ training_hours : int 150 128 12 18 12 2 78 38 46 126 ...
$ target : Factor w/ 2 levels "Yes","No": 2 1 2 2 2 1 2 2 2 2 ...
> str(x_test)
'data.frame': 6706 obs. of 13 variables:
$ enrollee_id : int 7129 31037 22179 29724 17977 30328 9446 13697 1184 23961 ...
$ city : chr "city_23" "city_44" "city_103" "city_50" ...
$ city_development_index: num 0.899 0.725 0.92 0.896 0.689 0.624 0.848 0.698 0.926 0.92 ...
$ gender : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 1 1 1 1 1 ...
$ relevent_experience : Factor w/ 2 levels "Yes","No": 2 2 2 2 1 1 1 1 2 1 ...
$ enrolled_university : Factor w/ 3 levels "Full time course",..: 2 3 1 1 1 2 2 2 2 2 ...
$ education_level : Factor w/ 5 levels "Graduate","High School",..: 1 3 1 1 3 1 1 3 4 1 ...
$ major_discipline : Factor w/ 6 levels "Arts","Business Degree",..: 6 6 6 6 6 6 6 6 6 6 ...
$ experience : Factor w/ 22 levels "<1",">20","1",..: 14 18 20 20 13 8 19 2 2 8 ...
$ company_size : Factor w/ 8 levels "<10","10/49",..: 6 6 6 6 6 6 6 1 5 5 ...
$ company_type : Factor w/ 6 levels "Early Stage Startup",..: 6 6 6 6 6 2 2 5 6 6 ...
$ last_new_job : Factor w/ 6 levels ">4","1","2","3",..: 2 6 3 6 2 2 2 6 1 2 ...
$ training_hours : int 23 39 262 78 125 94 22 55 31 46 ...
> colSums(is.na(full))
city city_development_index gender
0 0 0
relevent_experience enrolled_university education_level
0 0 0
major_discipline experience company_size
0 0 0
company_type last_new_job training_hours
0 0 0
target
0
> colSums(is.na(x_test))
enrollee_id city city_development_index
0 0 0
gender relevent_experience enrolled_university
0 0 0
education_level major_discipline experience
0 0 0
company_size company_type last_new_job
0 0 0
training_hours
0
해당 데이터는 스케일링이 굳이 필요하지 않다고 판단해서 넘어가기로 함
# Give every city one integer code shared by the training and test tables.
# If city is left as text, each table is converted to codes on its own, so the
# same city can end up with different codes at fit time and at predict time
# whenever the two tables do not contain exactly the same set of cities.
city_levels <- sort(unique(c(as.character(full$city), as.character(x_test$city))))
train_df <- full
test_df <- x_test
train_df$city <- as.integer(factor(as.character(full$city), levels = city_levels))
test_df$city <- as.integer(factor(as.character(x_test$city), levels = city_levels))

# Fit a 300-tree random forest on all predictors, printing progress per tree.
rf <- randomForest(
  target ~ .,
  data = train_df,
  ntree = 300,
  do.trace = TRUE
)

# AUC reported by ModelMetrics for the fitted forest.
auc(rf)

# Predicted class labels ("Yes"/"No") for the test rows.
pred <- predict(rf, newdata = test_df)
head(pred, 10)
> head(pred, 10)
1 2 3 4 5 6 7 8 9 10
No No No No No No No No No No
Levels: Yes No
# Convert the predicted labels back to the 1/0 coding of the target
# ("Yes" -> 1, "No" -> 0). The vector gets its own name so that it does not
# mask base::list().
target_pred <- as.factor(ifelse(pred == "Yes", 1, 0))
target_pred

# Pair each test id with its predicted target; the second column is named
# "list" here and both columns are renamed before the file is written.
result <- data.frame(
  x_test$enrollee_id,
  list = target_pred
)
> head(result)
x_test.enrollee_id list
1 7129 0
2 31037 0
3 22179 0
4 29724 0
5 17977 0
6 30328 0
# Give the submission its final column names and write it without row names.
names(result) <- c("enrollee_id", "target")
write.csv(result, "result.csv", row.names = FALSE)

# Read the file back to check what was written.
Rtest <- read.csv("result.csv")
head(Rtest)
> head(Rtest)
enrollee_id target
1 7129 0
2 31037 0
3 22179 0
4 29724 0
5 17977 0
6 30328 0
library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)
# Load the training features, training labels and test features.
# Empty cells and the strings "na"/"NA" are read as missing values.
# Text columns of the two feature tables are read as factors; the argument
# name is spelled out in full instead of relying on partial matching.
x_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_train.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA", NA)
)
y_train <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/y_train.csv",
  na.strings = c("", "na", "NA", NA)
)
x_test <- read.csv(
  "https://raw.githubusercontent.com/Datamanim/datarepo/main/HRdata/X_test.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "na", "NA", NA)
)

# Row counts and per-column summaries of the loaded tables.
nrow(x_train)
nrow(y_train)
nrow(x_test)
summary(x_train)
summary(y_train)
# Goal: predict the target column.
# Join features and labels on enrollee_id, then drop the id (first column).
full <- merge(x_train, y_train, "enrollee_id")
full <- full[, -1]

# Handle missing values in full.
colSums(is.na(full))

# Neighbour-based imputation for the categorical columns (up first, then down).
full <- fill(
  full,
  education_level, company_type, company_size, major_discipline,
  last_new_job, enrolled_university, experience,
  .direction = "updown"
)

# Missing and "Other" genders are both mapped to "Male".
full$gender[is.na(full$gender) | full$gender == "Other"] <- "Male"
full$gender <- factor(full$gender, levels = c("Male", "Female"))
table(full$gender)

# Yes/No version of the experience flag, "Yes" as the first level.
full$relevent_experience <- relevel(
  as.factor(
    ifelse(full$relevent_experience == "Has relevent experience", "Yes", "No")
  ),
  "Yes"
)

# Target 1/0 becomes a Yes/No factor, "Yes" as the first level.
full$target <- relevel(as.factor(ifelse(full$target == 1, "Yes", "No")), "Yes")

# City is kept as plain text rather than a factor.
full$city <- as.character(full$city)

colSums(is.na(full))
# ---- Test features: same cleaning steps as the training table ----

# Neighbour-based imputation for the categorical columns (up first, then down).
x_test <- fill(
  x_test,
  education_level, company_type, company_size, major_discipline,
  last_new_job, enrolled_university, experience,
  .direction = "updown"
)

# Missing and "Other" genders are both mapped to "Male".
x_test$gender[is.na(x_test$gender) | x_test$gender == "Other"] <- "Male"
x_test$gender <- factor(x_test$gender, levels = c("Male", "Female"))
table(x_test$gender)

# Yes/No version of the experience flag, "Yes" as the first level.
x_test$relevent_experience <- relevel(
  as.factor(
    ifelse(x_test$relevent_experience == "Has relevent experience", "Yes", "No")
  ),
  "Yes"
)

# City is kept as plain text rather than a factor.
x_test$city <- as.character(x_test$city)
colSums(is.na(x_test))

# Inspect the structure and remaining NA counts of both tables.
str(full)
str(x_test)
colSums(is.na(full))
colSums(is.na(x_test))
# Give every city one integer code shared by the training and test tables.
# If city is left as text, each table is converted to codes on its own, so the
# same city can end up with different codes at fit time and at predict time
# whenever the two tables do not contain exactly the same set of cities.
city_levels <- sort(unique(c(as.character(full$city), as.character(x_test$city))))
train_df <- full
test_df <- x_test
train_df$city <- as.integer(factor(as.character(full$city), levels = city_levels))
test_df$city <- as.integer(factor(as.character(x_test$city), levels = city_levels))

# Fit a 300-tree random forest on all predictors, printing progress per tree.
rf <- randomForest(
  target ~ .,
  data = train_df,
  ntree = 300,
  do.trace = TRUE
)

# AUC reported by ModelMetrics for the fitted forest.
auc(rf)

# Predicted class labels ("Yes"/"No") for the test rows.
pred <- predict(rf, newdata = test_df)
# Convert the predicted labels back to the 1/0 coding of the target
# ("Yes" -> 1, "No" -> 0). The vector gets its own name so that it does not
# mask base::list().
target_pred <- as.factor(ifelse(pred == "Yes", 1, 0))
target_pred

# Pair each test id with its predicted target and write the submission file
# without row names.
result <- data.frame(
  enrollee_id = x_test$enrollee_id,
  target = target_pred
)
write.csv(result, "result.csv", row.names = FALSE)