[R] 이기적 스터디 카페 1주차 예상문제

: ) YOUNG·2022년 6월 16일
1

빅분기

목록 보기
17/20
post-thumbnail

이기적 스터디 카페 1주차 예상문제


1유형 1


library(dplyr)
library(lubridate)

# Load the weather table used by the two questions below.
# 'UTF-8-BOM' lets read.csv skip a leading byte-order mark, if present, so
# the first column name is not polluted by it.
main <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/weather/weather2.csv',
    fileEncoding = 'UTF-8-BOM'
)


# Q1. During the summer months (June, July, August), in how many time slots
#     was the temperature at Ihwa-dong higher than at Suyeong-dong?

ds1 <- main
# Set membership must use %in%: `month(time) == c(6, 7, 8)` recycles the three
# values down the rows, so each row would be compared with only one of the
# months and most summer rows would be dropped.
temp <- ds1 %>%
    filter(month(time) %in% c(6, 7, 8) & 이화동기온 > 수영동기온)
result1 <- nrow(temp)
print(result1)


# Q2. Find, for Ihwa-dong and Suyeong-dong separately, the time slot(s) at
#     which the maximum precipitation was recorded.

ds2 <- main
# na.rm = TRUE keeps a single missing reading from turning the maximum into NA
# (which would match no row at all).
ihwa_max <- max(ds2$이화동강수, na.rm = TRUE)
suyeong_max <- max(ds2$수영동강수, na.rm = TRUE)

# One entry per station, so the answer stays unambiguous even when both maxima
# fall on the same row or a maximum is tied across several rows.
result2 <- list(
    이화동 = ds2$time[which(ds2$이화동강수 == ihwa_max)],
    수영동 = ds2$time[which(ds2$수영동강수 == suyeong_max)]
)
print(result2)



2유형 1


library(dplyr)
library(caret)
library(tidyr)
library(ModelMetrics)
library(readr)
library(randomForest)


# Load the three churn files: training data, test data and the submission
# file. Empty strings, single blanks and "NA" are all read as missing values.
train <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/train.csv',
    na.strings = c("", " ", "NA", NA),
    fileEncoding = 'UTF-8-BOM'
)

test <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/test.csv',
    na.strings = c("", " ", "NA", NA),
    fileEncoding = 'UTF-8-BOM'
)

sub <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/submission.csv',
    na.strings = c("", " ", "NA", NA),
    fileEncoding = 'UTF-8-BOM'
)


# Goal: predict the dependent variable Exited.
summary(train)
head(train)
summary(test)
head(test)

# Drop the first three columns of train. test drops only columns 1 and 3, so
# its second column stays available (presumably CustomerId, which is read
# again when the result file is built -- confirm against the CSV header).
train <- train[, -c(1, 2, 3)]
test <- test[, -c(1, 3)]

# Missing values per column (original note: none found).
colSums(is.na(train))
colSums(is.na(test))

# Target: 0 -> 'No', anything else -> 'Yes'.
train$Exited <- as.factor(ifelse(train$Exited == 0, 'No', 'Yes'))

# 0/1 flags become 'No'/'Yes' labels in both data sets.
flag_cols <- c('HasCrCard', 'IsActiveMember')
for (nm in flag_cols) {
    train[[nm]] <- ifelse(train[[nm]] == 1, 'Yes', 'No')
    test[[nm]] <- ifelse(test[[nm]] == 1, 'Yes', 'No')
}

# NumOfProducts goes through character first, as in the original recoding.
train$NumOfProducts <- as.character(train$NumOfProducts)
test$NumOfProducts <- as.character(test$NumOfProducts)

# Convert every categorical predictor to a factor. The test factors reuse the
# levels of the training factors: predict() on a randomForest fails when a
# factor in newdata has a different level set than at training time, which
# happens as soon as one rare category is absent from test. A value that
# exists only in test becomes NA.
cat_cols <- c('Geography', 'Gender', 'Tenure', 'NumOfProducts', flag_cols)
for (nm in cat_cols) {
    train[[nm]] <- as.factor(train[[nm]])
    test[[nm]] <- factor(test[[nm]], levels = levels(train[[nm]]))
}



# Data scaling: fit a min-max ('range') transform on the training data.
model <- preProcess(
    train,
    method = c('range')
)

train <- predict(
    model,
    train
)

# Apply the SAME transform to test. The forest below is grown on scaled
# values, so predicting on unscaled test columns would compare raw numbers
# against split points learned on the [0, 1] scale.
test <- predict(
    model,
    test
)

# Random forest classifier for Exited using all remaining columns of train.
rf <- randomForest(
    Exited ~ .,
    train,
    ntree = 400,
    do.trace = TRUE
)

# Predicted class labels for the test rows.
pred <- predict(
    object = rf,
    newdata = test
)


# Reference labels taken from the submission file.
# `sub` is a data frame, so the label column has to be extracted first;
# recoding the whole data frame would mix every column into one factor.
# NOTE(review): assumes the labels are the LAST column of `sub` -- confirm.
actual <- sub[[ncol(sub)]]

# Fix the level set explicitly. Assigning `levels(x) <- c('No', 'Yes')`
# afterwards would relabel a lone 'Yes' level as 'No' when only one class
# is present.
actual <- factor(
    ifelse(actual == 0, 'No', 'Yes'),
    levels = c('No', 'Yes')
)

# AUC of the fitted forest (ModelMetrics method for model objects).
auc(rf)

# caret:: is spelled out because ModelMetrics, attached later, also exports a
# confusionMatrix() function.
cm <- caret::confusionMatrix(
    pred,
    actual,
    mode = 'everything'
)

f1score <- cm$byClass['F1']
print(f1score)
head(pred)

# Turn the predicted labels back into 0/1 codes and write them, keyed by
# CustomerId, to result.csv without row names.
pred <- ifelse(pred == 'No', 0, 1)

result <- data.frame(
    CustomerId = test$CustomerId,
    Exited = pred
)

write.csv(result, 'result.csv', row.names = FALSE)


1유형 2


library(dplyr)
library(tidyr)
library(caret)


# Load the churn training data used by the three questions below.
# 'UTF-8-BOM' lets read.csv skip a leading byte-order mark, if present.
main <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/churn/train.csv',
    fileEncoding = 'UTF-8-BOM'
)


# Q1. Which country (Geography) has the most male customers who exited, and
#     how many exited there?

ds1 <- main
# Keep male customers who exited; both conditions must hold.
ds1 <- ds1 %>%
    filter(Gender == 'Male', Exited == 1)

# Every kept row has Exited == 1, so the per-country sum is a head count.
temp <- aggregate(Exited ~ Geography, data = ds1, FUN = sum)

# Largest count first, then show the top row.
temp <- temp[order(-temp$Exited), ]
print(temp[1, ])


# Q2. What is the mean age of customers who own a card (HasCrCard == 1) and
#     are active members (IsActiveMember == 1)?

ds2 <- main
str(ds2)
result2 <- ds2 %>%
    filter(HasCrCard == 1, IsActiveMember == 1) %>%
    summarise(mean = mean(Age))
print(result2)



# Q3. Standard deviation of CreditScore among customers whose Balance is at
#     or above the median Balance.
ds3 <- main
me <- median(ds3$Balance)
result3 <- ds3 %>%
    filter(Balance >= me) %>%
    summarise(sd = sd(CreditScore))
print(result3)



2유형 2



library(dplyr)
library(caret)
library(randomForest)
library(e1071)
library(tidyr)
library(ModelMetrics)


# Goal: predict the dependent variable diagnosis (B = benign, M = malignant).
# Load the training data, test data and submission file; empty strings,
# single blanks and "NA" are all read as missing values.

train <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/cancer/train.csv',
    na.strings = c("", " ", "NA", NA),
    fileEncoding = 'UTF-8-BOM'
)

test <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/cancer/test.csv',
    na.strings = c("", " ", "NA", NA),
    fileEncoding = 'UTF-8-BOM'
)

sub <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/cancer/submission.csv',
    na.strings = c("", " ", "NA", NA),
    fileEncoding = 'UTF-8-BOM'
)


# Drop the first column of train and make the target a factor.
train <- train[, -1]
train$diagnosis <- as.factor(train$diagnosis)


# Hold-out validation: 70% of the rows for fitting, the rest for checking.

set.seed(2108)
parts <- sample(seq_len(nrow(train)), size = nrow(train) * 0.7)

t.train <- train[parts, ]
t.valid <- train[-parts, ]

# Min-max scaling fitted on the fitting split only, leaving out its first
# column (presumably diagnosis -- confirm), then applied to both splits.
model <- preProcess(t.train[, -1], method = 'range')

sd.train <- predict(model, t.train)
sd.valid <- predict(model, t.valid)

# Candidate 1: random forest, checked on the validation split.
rf <- randomForest(diagnosis ~ ., data = sd.train, ntree = 400, do.trace = TRUE)

rf.p <- predict(rf, newdata = sd.valid)

caret::confusionMatrix(
    data = rf.p,
    reference = sd.valid$diagnosis,
    mode = 'everything'
)

# Accuracy recorded by the author: 0.9708


# Candidate 2: support vector machine with default settings.
sv <- svm(diagnosis ~ ., data = sd.train)

sv.p <- predict(sv, newdata = sd.valid)

caret::confusionMatrix(
    data = sv.p,
    reference = sd.valid$diagnosis,
    mode = 'everything'
)

# Accuracy recorded by the author: 0.9927

# The SVM wins.

# Refit the chosen SVM on the full training data and predict the test rows.
last <- svm(diagnosis ~ ., data = train)

pred <- predict(last, newdata = test)

# Pair every test id with its predicted diagnosis.
result <- data.frame(
    id = test$id,
    diagnosis = pred
)

head(result)

0개의 댓글