[R] 이기적 스터디 카페 5주차 예상문제

: ) YOUNG·2022년 6월 21일
1

빅분기

목록 보기
20/20
post-thumbnail


1유형 1

데이터 설명 : 의료비용 예측문제

Data url : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv

Q1. 흡연자와 비흡연자 각각 charges의 상위 10% 그룹의 평균의 차이는?

library(dplyr)
library(tidyr)


# Load the medical-cost training data; text columns are read as factors.
main <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv',
    encoding = 'UTF-8',
    # Spelled out in full: the shortened `stringsAsFactor` only worked through
    # partial argument matching.
    stringsAsFactors = TRUE
)

# Q1. Difference between the mean `charges` of the top 10% of smokers and the
# mean `charges` of the top 10% of non-smokers.

# Mean of the largest 10% of the values in `x`.
# The group size is floor(10% of length(x)), but never fewer than one value,
# so the row count is an explicit integer instead of a fractional index.
top_decile_mean <- function(x) {
    k <- max(1L, floor(length(x) * 0.1))
    mean(head(sort(x, decreasing = TRUE), k))
}

# which() drops rows where `smoker` is missing instead of indexing with NA.
smoking_mean <- top_decile_mean(main$charges[which(main$smoker == 'yes')])
smoking_mean

no_smoking_mean <- top_decile_mean(main$charges[which(main$smoker == 'no')])
no_smoking_mean

result1 <- abs(no_smoking_mean - smoking_mean)
print(result1)



> print(result1)
[1] 29297.95



2유형 1


의료비용 예측문제

평가지표 : r2 score

trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv

testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/test.csv



library(dplyr)
library(tidyr)
library(caret)
library(ModelMetrics)
library(rpart)
library(scales)
library(readr)


# Data description: medical cost prediction
# Evaluation metric: R2 score

# Empty strings, single blanks and "NA" are all read as missing values; text
# columns are read as factors. `stringsAsFactors` is spelled out in full
# rather than relying on partial argument matching.
train <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', "NA", NA),
    stringsAsFactors = TRUE
)

test <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/test.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', "NA", NA),
    stringsAsFactors = TRUE
)

# Author's note: there are no missing values



# Data scaling (caret `range` method).
# The scaler is fitted on the training data with column 7 left out of the fit
# (presumably the `charges` target -- confirm against the CSV header).
model <- preProcess(
    train[, -c(7)],
    method = c('range')
)

train <- predict(
    model,
    train
)

# Apply the scaler fitted on `train` to `test` as well. Fitting a second
# scaler on `test` would rescale each set with its own min/max, so the same
# raw value would be encoded differently in train and test.
test <- predict(
    model,
    test
)


# Model performance check: fit on a 70% sample of `train`, validate on the rest.

set.seed(2100)
# seq_len() is safe for an empty data frame (1:0 would yield c(1, 0)), and
# floor() makes the sample size an explicit whole number.
parts <- sample(
    seq_len(nrow(train)),
    size = floor(nrow(train) * 0.7)
)

t.train <- train[parts, ]
t.valid <- train[-parts, ]

# Regression tree on all predictors
rp_test <- rpart(
    charges ~ .,
    data = t.train
)
summary(rp_test)

rp_test_pred <- predict(
    rp_test,
    newdata = t.valid
)

RMSE(rp_test_pred, t.valid$charges)
caret::R2(rp_test_pred, t.valid$charges)
# Recorded output:
# > caret::R2(rp_test_pred, t.valid$charges)
# [1] 0.8418462



# Build the regression: charges against every other column of `train`
glm_model <- lm(formula = charges ~ ., data = train)
summary(glm_model)


# Stepwise selection in both directions, then report the R-squared of the
# selected model
glm_model2 <- step(object = glm_model, direction = 'both')
su <- summary(glm_model2)
print(su[['r.squared']])

# Recorded output:
# F-statistic: 458.8 on 7 and 1062 DF,  p-value: < 2.2e-16
# Multiple R-squared:  0.7515,    Adjusted R-squared:  0.7499



# Fit the final regression tree on the full training set.
rp <- rpart(
    charges ~ .,
    data = train
)

# Predict the test set. The call no longer ends with a stray trailing comma,
# which passed an empty extra argument to predict().
rp.p <- predict(
    rp,
    newdata = test
)

# One-column submission table named after the target.
result <- data.frame(charges = rp.p)
head(result)

# Output file name corrected from the misspelled 'resutl.csv'.
write.csv(result, 'result.csv', row.names = FALSE)


> head(result)
    charges
1 41999.202
2 12356.380
3  5422.149
4 12356.380
5 12356.380
6  5422.149


1유형 2

Q1. bedrooms의 빈도가 가장 높은 값을 가지는 데이터들의 price의 상위 10% 값과 하위 10% 값의 차이를 구하여라


library(dplyr)
library(tidyr)
library(plyr)

# Load the King County house-price training data
main <- read.csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/train.csv',
    encoding = 'UTF-8'
)


# Q1. Among the rows whose `bedrooms` value is the most frequent one, find the
# difference between the top-10% and bottom-10% values of `price`.

main$bedrooms <- as.factor(main$bedrooms)

# Frequency table of bedroom counts; the first maximum wins when counts tie
ta <- as.data.frame(table(main$bedrooms))
top <- ta$Var1[which.max(ta$Freq)]

main <- main %>% filter(bedrooms == top)

# 10th and 90th percentiles of price within that group
price_cut <- quantile(main$price, probs = c(0.1, 0.9))
result1 <- abs(price_cut[1] - price_cut[2])
print(result1)



> print(result1)
   10%
505500


2유형 2


킹카운티 주거지 가격 예측문제

평가지표 : r2 score

trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/train.csv

testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/test.csv




library(dplyr)
library(caret)
library(tidyr)
library(rpart)
library(tidyr)
library(plyr)
library(scales)
library(ModelMetrics)
library(lubridate)


# Data description: King County house price prediction
# Evaluation metric: R2 score



# Empty strings, single blanks and "NA" are all read as missing values.
# `na.strings` is spelled out in full; the shortened `na.string` only worked
# through partial argument matching.
train <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/train.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', 'NA', NA)
)

test <- read.csv(
    file = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/test.csv',
    encoding = 'UTF-8',
    na.strings = c('', ' ', 'NA', NA)
)




# The id column (first column of train) is not needed, so drop it
train <- train[, -1]


# Type conversion: turn the same discrete columns into factors in both sets
factor_cols <- c('floors', 'view', 'condition', 'waterfront')
train[factor_cols] <- lapply(train[factor_cols], as.factor)
test[factor_cols] <- lapply(test[factor_cols], as.factor)


# Check that the factor columns line up between train and test

temp <- select_if(train, is.factor)
str(temp)

temp2 <- select_if(test, is.factor)
str(temp2)

# Author's note: the factor level counts (6, 5, 5) were confirmed to match

str(train)
# Fit the `range` scaler on the training data with column 2 left out of the
# fit (presumably the `price` target -- confirm against str(train)).
model <- preProcess(
    train[, -2],
    method = c('range')
)

train <- predict(
    model,
    train
)

summary(train)


str(test)
# Apply the scaler fitted on `train` to `test` as well. Fitting a second
# scaler on `test` would rescale each set with its own min/max, so the same
# raw value would be encoded differently in train and test.
# NOTE(review): `test` still carries its id column, which the scaler was not
# fitted on; caret is expected to leave such columns untouched -- confirm.
test <- predict(
    model,
    test
)

summary(test)


# Model performance check: fit on a 70% sample of `train`, validate on the rest.

set.seed(2100)
# seq_len() is safe for an empty data frame (1:0 would yield c(1, 0)), and
# floor() makes the sample size an explicit whole number.
parts <- sample(
    seq_len(nrow(train)),
    size = floor(nrow(train) * 0.7)
)

t.train <- train[parts, ]
t.valid <- train[-parts, ]


# Linear model on all predictors, reduced by stepwise selection.
test_model <- lm(price ~ ., t.train)

# The argument name was misspelled as `diretion`, which step() does not
# recognise as its `direction` parameter.
test_model <- step(test_model, direction = 'both')
summary(test_model)


# Regression tree restricted to the terms kept by the stepwise search.
# rpart() is documented to take a formula, so the formula is extracted from
# the lm fit instead of passing the fitted model object itself.
rp_test <- rpart(
    formula(test_model),
    data = t.train
)
summary(rp_test)

rp_test_pred <- predict(
    rp_test,
    newdata = t.valid
)

RMSE(rp_test_pred, t.valid$price)
caret::R2(rp_test_pred, t.valid$price)

# Recorded output:
# > rmse(rp_test_pred , t.valid$price)
# [1] 213994.4





### fit

str(train)
# Linear model on all predictors of the full training set.
model_glm <- lm(
    price ~ .,
    train
)
summary(model_glm)

# Stepwise selection. The argument was written as `diretion = both`: a
# misspelled name carrying an undefined bare symbol. It is now the real
# `direction` parameter with a quoted value.
model_glm2 <- step(
    model_glm,
    direction = 'both'
)
summary(model_glm2)


# Regression tree restricted to the terms kept by the stepwise search.
# rpart() is documented to take a formula, so the formula is extracted from
# the lm fit instead of passing the fitted model object itself.
rp <- rpart(
    formula(model_glm2),
    data = train
)

rp.p <- predict(
    rp,
    newdata = test
)

# Round predicted prices to whole numbers
rp.p <- round(rp.p)

# Submission table: test ids next to their predicted prices
result <- data.frame(
    id = test$id,
    price = rp.p
)

head(result)
write.csv(result, 'result2.csv', row.names = FALSE)




> head(result)
          id  price
1 1974300020 947269
2 1974300020 947269
3 3630020380 556394
4 1771000290 556394
5 5126310470 458887
6 1870400605 947269

0개의 댓글