# 이기적 스터디 카페 1주차 예상문제 (Igijeok Study Cafe: week 1 practice problems)
# 1유형 1 (Type 1, problem 1) ----
library(dplyr)
library(lubridate)
# Load the weather data set (the file is read as UTF-8 with a byte-order mark).
main <- read.csv(
  file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/weather/weather2.csv',
  fileEncoding = 'UTF-8-BOM'
)

# Count the rows observed in June, July or August where the 이화동 (Ihwa-dong)
# temperature is higher than the 수영동 (Suyeong-dong) temperature.
ds1 <- main
temp <- ds1 %>%
  filter(
    # `%in%` tests set membership for every row. The previous
    # `month(time) == c('6', '7', '8')` recycled the three values down the
    # column, so each row was compared with only one of the three months.
    month(time) %in% 6:8 & 이화동기온 > 수영동기온
  )
result1 <- nrow(temp)
print(result1)
# Keep the rows where either station's precipitation equals its column
# maximum, sort them by the 이화동 (Ihwa-dong) precipitation in descending
# order, and report their observation times.
ds2 <- main
temp <- filter(
  ds2,
  이화동강수 == max(이화동강수) | 수영동강수 == max(수영동강수)
)
temp <- temp[order(temp$이화동강수, decreasing = TRUE), ]
result2 <- temp$time
print(result2)
# 2유형 1 (Type 2, problem 1) ----
library(dplyr)
library(caret)
library(tidyr)
library(ModelMetrics)
library(readr)
library(randomForest)
# Read the three churn files. Empty strings, single blanks and "NA" are all
# treated as missing values on import.
train <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/train.csv',
  na.strings = c("", " ", "NA", NA),
  fileEncoding = 'UTF-8-BOM'
)
test <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/test.csv',
  na.strings = c("", " ", "NA", NA),
  fileEncoding = 'UTF-8-BOM'
)
sub <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/submission.csv',
  na.strings = c("", " ", "NA", NA),
  fileEncoding = 'UTF-8-BOM'
)
# Quick look at both data sets before any transformation.
summary(train)
head(train)
summary(test)
head(test)

# Drop the first three columns of train and columns 1 and 3 of test.
# NOTE(review): test keeps its second column, presumably CustomerId, which is
# read again when the result file is built -- confirm against the CSV header.
train <- train[, -c(1, 2, 3)]
test <- test[, -c(1, 3)]

# Missing-value counts per column.
colSums(is.na(train))
colSums(is.na(test))

# Target: 0 -> 'No', anything else -> 'Yes'.
train$Exited <- factor(
  ifelse(train$Exited == 0, 'No', 'Yes'),
  levels = c('No', 'Yes')
)

# Binary flags: 1 -> 'Yes', anything else -> 'No'. The levels are fixed
# explicitly so train and test always share the same level set.
train$HasCrCard <- factor(
  ifelse(train$HasCrCard == 1, 'Yes', 'No'),
  levels = c('No', 'Yes')
)
test$HasCrCard <- factor(
  ifelse(test$HasCrCard == 1, 'Yes', 'No'),
  levels = c('No', 'Yes')
)
train$IsActiveMember <- factor(
  ifelse(train$IsActiveMember == 1, 'Yes', 'No'),
  levels = c('No', 'Yes')
)
test$IsActiveMember <- factor(
  ifelse(test$IsActiveMember == 1, 'Yes', 'No'),
  levels = c('No', 'Yes')
)

# Categorical predictors. Each test factor reuses the levels learned from
# train: building the factors independently can give test a different level
# set, which predict() on a randomForest model rejects. Values that never
# occur in train become NA in test.
train$Geography <- as.factor(train$Geography)
test$Geography <- factor(test$Geography, levels = levels(train$Geography))

train$NumOfProducts <- as.factor(as.character(train$NumOfProducts))
test$NumOfProducts <- factor(
  as.character(test$NumOfProducts),
  levels = levels(train$NumOfProducts)
)

train$Gender <- as.factor(train$Gender)
test$Gender <- factor(test$Gender, levels = levels(train$Gender))

train$Tenure <- as.factor(train$Tenure)
test$Tenure <- factor(test$Tenure, levels = levels(train$Tenure))
# Learn the 'range' scaling from the training data only.
model <- preProcess(
  train,
  method = c('range')
)
train <- predict(
  model,
  train
)
# Apply the same transformation to test. Previously only train was rescaled,
# so the forest was fitted on scaled values but scored on raw ones.
test <- predict(
  model,
  test
)

# Fix the RNG state so the fitted forest, and therefore the predictions,
# are reproducible between runs.
set.seed(2108)
rf <- randomForest(
  Exited ~ .,
  train,
  ntree = 400,
  do.trace = TRUE
)

# Class predictions ('No' / 'Yes') for the test rows.
pred <- predict(
  object = rf,
  newdata = test
)
# Reference labels from the submission file. The whole data frame used to be
# compared with 0 and flattened into a factor, which mixes every column into
# the labels; only the label column is used now.
# NOTE(review): assumes the label column is named Exited, falling back to the
# last column of `sub` -- confirm against the CSV header.
actual_raw <- if ('Exited' %in% names(sub)) sub[['Exited']] else sub[[ncol(sub)]]
actual <- factor(
  ifelse(actual_raw == 0, 'No', 'Yes'),
  levels = c('No', 'Yes')
)

# AUC computed from the fitted forest object itself.
auc(rf)

# Confusion matrix of the test predictions against the reference labels.
# 'Yes' is set as the positive class so F1 describes the churned customers;
# by default caret treats the first factor level ('No') as positive.
cm <- caret::confusionMatrix(
  pred,
  actual,
  mode = 'everything',
  positive = 'Yes'
)
f1score <- cm$byClass['F1']
print(f1score)
# Peek at the predicted classes, recode them to 0 / 1, and save them next to
# the customer ids as result.csv (without row names).
head(pred)
pred <- ifelse(pred == 'No', 0, 1)
result <- setNames(
  data.frame(test$CustomerId, pred),
  c('CustomerId', 'Exited')
)
write.csv(result, file = 'result.csv', row.names = FALSE)
# 1유형 2 (Type 1, problem 2) ----
library(dplyr)
library(tidyr)
library(caret)
# Load the churn training data.
main <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/train.csv',
  fileEncoding = 'UTF-8-BOM'
)

# Among male customers who exited, total the exits per Geography and show
# the Geography with the largest total.
ds1 <- main %>%
  filter(Gender == 'Male', Exited == 1)
temp <- aggregate(Exited ~ Geography, data = ds1, FUN = sum)
temp <- temp[order(temp$Exited, decreasing = TRUE), ]
print(head(temp, 1))
# Mean Age of customers who hold a credit card and are active members.
ds2 <- main
str(ds2)
result2 <- ds2 %>%
  filter(HasCrCard == 1, IsActiveMember == 1) %>%
  summarise(mean = mean(Age))
print(result2)
# Standard deviation of CreditScore among customers whose Balance is at or
# above the median Balance of the full data set.
ds3 <- main
me <- median(main$Balance)
result3 <- summarise(
  filter(ds3, Balance >= me),
  sd = sd(CreditScore)
)
print(result3)
# 2유형 2 (Type 2, problem 2) ----
library(dplyr)
library(caret)
library(randomForest)
library(e1071)
library(tidyr)
library(ModelMetrics)
# Read the three cancer files. Empty strings, single blanks and "NA" are all
# treated as missing values on import.
train <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/cancer/train.csv',
  na.strings = c("", " ", "NA", NA),
  fileEncoding = 'UTF-8-BOM'
)
test <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/cancer/test.csv',
  na.strings = c("", " ", "NA", NA),
  fileEncoding = 'UTF-8-BOM'
)
sub <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/cancer/submission.csv',
  na.strings = c("", " ", "NA", NA),
  fileEncoding = 'UTF-8-BOM'
)
# Drop the first column of train and turn the target into a factor.
# NOTE(review): the dropped column is presumably the id -- confirm.
train <- train[, -1]
train$diagnosis <- as.factor(train$diagnosis)

# Reproducible 70 / 30 split into a fitting set and a validation set.
set.seed(2108)
parts <- sample(
  seq_len(nrow(train)),                # safe even if train has zero rows
  size = floor(nrow(train) * 0.7)      # make the whole-number sample size explicit
)
t.train <- train[parts, ]
t.valid <- train[-parts, ]
# Learn the 'range' scaling from the fitting set (first column excluded),
# then apply that same transformation to the fitting and validation sets.
model <- preProcess(t.train[, -1], method = 'range')
sd.train <- predict(object = model, newdata = t.train)
sd.valid <- predict(object = model, newdata = t.valid)
# Candidate 1: random forest fitted on the scaled fitting set.
rf <- randomForest(
  diagnosis ~ .,
  sd.train,
  ntree = 400,
  do.trace = TRUE
)

# Predict the validation set. The stray trailing comma that passed an empty
# extra argument to predict() has been removed.
rf.p <- predict(
  rf,
  newdata = sd.valid
)

# Validation metrics for the random forest.
caret::confusionMatrix(
  rf.p,
  sd.valid$diagnosis,
  mode = 'everything'
)
# Candidate 2: support vector machine fitted on the scaled fitting set.
# The stray trailing commas that passed an empty extra argument to svm() and
# predict() have been removed.
sv <- svm(
  diagnosis ~ .,
  data = sd.train
)
sv.p <- predict(
  sv,
  newdata = sd.valid
)

# Validation metrics for the SVM.
caret::confusionMatrix(
  sv.p,
  sd.valid$diagnosis,
  mode = 'everything'
)
# Final model: an SVM refitted on the complete (unscaled) training data and
# applied to the unscaled test data, so both sides see the same value scale.
# The stray trailing comma that passed an empty extra argument to svm() has
# been removed.
last <- svm(
  diagnosis ~ .,
  data = train
)
pred <- predict(
  last,
  newdata = test
)

# Pair each test id with its predicted diagnosis and preview the table.
result <- data.frame(
  test$id,
  pred
)
names(result) <- c('id', 'diagnosis')
head(result)