# Dataset: medical cost prediction problem
# Data url : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv
library(dplyr)
library(tidyr)
# Load the medical-cost training data. Text columns are read as factors.
# The argument is spelled out in full (`stringsAsFactors`) instead of relying
# on R's partial argument matching.
main <- read.csv(
  file = "https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv",
  encoding = "UTF-8",
  stringsAsFactors = TRUE
)
# Q1. What is the difference between the mean charges of the top 10% (by
# charges) of smokers and the top 10% of non-smokers?

# Return the rows of `df` with the highest 10% of `charges`, sorted in
# descending order of charges. The row count is 10% of nrow(df) rounded down
# (integer arithmetic, so no fractional subscript is ever used); a non-empty
# group always contributes at least one row, and an empty group yields zero
# rows.
top_decile_by_charges <- function(df) {
  ranked <- df[order(-df$charges), ]
  n_top <- min(nrow(ranked), max(1L, nrow(ranked) %/% 10L))
  ranked[seq_len(n_top), ]
}

smoking <- top_decile_by_charges(main %>% filter(smoker == "yes"))
smoking_mean <- mean(smoking$charges)
smoking_mean

noSmoking <- top_decile_by_charges(main %>% filter(smoker == "no"))
no_smoking_mean <- mean(noSmoking$charges)
no_smoking_mean

# Absolute gap between the two top-decile means.
result1 <- abs(no_smoking_mean - smoking_mean)
print(result1)
# > print(result1)
# [1] 29297.95
# Evaluation metric: R2 score
# trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv
# testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/test.csv
library(dplyr)
library(tidyr)
library(caret)
library(ModelMetrics)
library(rpart)
library(scales)
library(readr)
# Dataset: medical cost prediction problem
# Evaluation metric: R2 score

# Load the train and test sets. Empty strings, single blanks and "NA" are
# treated as missing; text columns are read as factors. `stringsAsFactors`
# is spelled out in full rather than relying on partial argument matching.
train <- read.csv(
  file = "https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/train.csv",
  encoding = "UTF-8",
  na.strings = c("", " ", "NA", NA),
  stringsAsFactors = TRUE
)
test <- read.csv(
  file = "https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/test.csv",
  encoding = "UTF-8",
  na.strings = c("", " ", "NA", NA),
  stringsAsFactors = TRUE
)
# No missing values in this dataset (as noted by the original author).
# Feature scaling.
# The min-max ("range") parameters are estimated on the training predictors
# only; column 7 is excluded from the fit (presumably the target `charges`
# -- confirm against str(train)).
model <- preProcess(
  train[, -c(7)],
  method = c("range")
)
train <- predict(model, train)

# Apply the SAME training-fitted transformation to the test set. Fitting a
# second scaler on the test data would map train and test onto different
# scales, so split thresholds learned on train would not line up with the
# test features.
# NOTE(review): assumes test contains every predictor column scaled above.
test <- predict(model, test)
# Model validation: hold out 30% of the training rows, fit a regression tree
# on the remaining 70%, and score it on the held-out part.
set.seed(2100)
parts <- sample(seq_len(nrow(train)), size = nrow(train) * 0.7)
t.train <- train[parts, ]
t.valid <- train[-parts, ]

rp_test <- rpart(formula = charges ~ ., data = t.train)
summary(rp_test)

rp_test_pred <- predict(rp_test, newdata = t.valid)

# Hold-out error and R-squared of the tree.
RMSE(rp_test_pred, t.valid$charges)
caret::R2(rp_test_pred, t.valid$charges)
# Output recorded by the original author:
# > caret::R2(rp_test_pred, t.valid$charges)
# [1] 0.8418462
# Build a linear regression of charges on all other columns.
glm_model <- lm(formula = charges ~ ., data = train)
summary(glm_model)

# Stepwise term selection in both directions, then report R-squared.
glm_model2 <- step(glm_model, direction = "both")
su <- summary(glm_model2)
print(su$r.squared)
# Output recorded by the original author:
# F-statistic: 458.8 on 7 and 1062 DF, p-value: < 2.2e-16
# Multiple R-squared: 0.7515, Adjusted R-squared: 0.7499
# Final model: regression tree on the full training set, predictions for the
# test set, exported as a single-column CSV.
rp <- rpart(
  charges ~ .,
  data = train
)
# (The stray trailing empty argument in the original predict() call is gone.)
rp.p <- predict(rp, newdata = test)

result <- data.frame(charges = rp.p)
head(result)

# Output file name corrected from the misspelled 'resutl.csv'.
write.csv(result, "result.csv", row.names = FALSE)
# > head(result)
#     charges
# 1 41999.202
# 2 12356.380
# 3  5422.149
# 4 12356.380
# 5 12356.380
# 6  5422.149
library(dplyr)
library(tidyr)
library(plyr)
# Load the King County house-price training data.
main <- read.csv(
  encoding = "UTF-8",
  file = "https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/train.csv"
)
# Q1. Among the rows whose `bedrooms` value is the most frequent one, compute
# the difference between the top-10% and bottom-10% price values
# (90th percentile minus 10th percentile).
main$bedrooms <- as.factor(main$bedrooms)

# Most frequent bedrooms level; which.max() returns the first level on ties,
# the same pick as taking the first row after a stable sort by descending
# frequency.
bedroom_counts <- table(main$bedrooms)
top <- names(bedroom_counts)[which.max(bedroom_counts)]
main <- main %>% filter(bedrooms == top)

# `topth` is the top-10% boundary (90th percentile) and `under` the
# bottom-10% boundary (10th percentile); the original had the two swapped.
topth <- quantile(main$price, 0.9)
under <- quantile(main$price, 0.1)

# unname() drops the percentile label, which would otherwise tag the
# difference with the name of a single quantile.
result1 <- unname(abs(topth - under))
print(result1)
# > print(result1)
#    10%
# 505500
# Evaluation metric: R2 score
# trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/train.csv
# testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/test.csv
library(dplyr)
library(caret)
library(tidyr)
library(rpart)
library(tidyr)
library(plyr)
library(scales)
library(ModelMetrics)
library(lubridate)
# Dataset: King County housing price prediction problem
# Evaluation metric: R2 score

# Load the train and test sets. Empty strings, single blanks and "NA" are
# treated as missing. The argument is spelled out as `na.strings` instead of
# relying on partial matching of the truncated `na.string`.
train <- read.csv(
  file = "https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/train.csv",
  encoding = "UTF-8",
  na.strings = c("", " ", "NA", NA)
)
test <- read.csv(
  file = "https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/test.csv",
  encoding = "UTF-8",
  na.strings = c("", " ", "NA", NA)
)
# The id column is not needed for modelling, so drop it from train
# (test keeps its id; it is used later when writing the submission).
train <- train[, -c(1)]

# Type conversion: treat these columns as categorical in both data sets.
# Each column is converted exactly once (the original converted `floors`
# twice).
factor_cols <- c("floors", "view", "condition", "waterfront")
train[factor_cols] <- lapply(train[factor_cols], as.factor)
test[factor_cols] <- lapply(test[factor_cols], as.factor)

# Check that the factor columns have matching numbers of levels.
# is.factor() is used instead of comparing class() strings, which breaks on
# columns carrying more than one class (e.g. ordered factors).
temp <- train[vapply(train, is.factor, logical(1))]
str(temp)
temp2 <- test[vapply(test, is.factor, logical(1))]
str(temp2)
# Original author's observation: the factor level counts (6 5 5) are the
# same in train and test.
# Feature scaling.
# The min-max ("range") parameters are estimated on the training predictors
# only; column 2 is excluded from the fit (presumably the target `price`
# -- confirm against the str() output below).
str(train)
model <- preProcess(
  train[, -2],
  method = c("range")
)
train <- predict(model, train)
summary(train)

# Apply the SAME training-fitted transformation to the test set. Fitting a
# second scaler on the test data would put train and test features on
# different scales. Columns the scaler was not fitted on (such as test's
# `id`) pass through unchanged.
# NOTE(review): assumes test contains every predictor column scaled above.
str(test)
test <- predict(model, test)
summary(test)
# Model validation: hold out 30% of the training rows, select terms with a
# stepwise linear model on the remaining 70%, fit a regression tree on the
# selected terms, and score it on the held-out part.
set.seed(2100)
parts <- sample(
  seq_len(nrow(train)),
  size = nrow(train) * 0.7
)
t.train <- train[parts, ]
t.valid <- train[-parts, ]

test_model <- lm(price ~ ., data = t.train)
# `direction` was misspelled as `diretion` in the original, so the argument
# was swallowed by `...` and step() silently used its default.
test_model <- step(test_model, direction = "both")
summary(test_model)

# Hand rpart the selected formula explicitly rather than the lm object.
rp_test <- rpart(
  formula(test_model),
  data = t.train
)
summary(rp_test)

rp_test_pred <- predict(
  rp_test,
  newdata = t.valid
)

# Hold-out error and R-squared of the tree.
RMSE(rp_test_pred, t.valid$price)
caret::R2(rp_test_pred, t.valid$price)
# Output recorded by the original author:
# > rmse(rp_test_pred , t.valid$price)
# [1] 213994.4
### fit
# Final model: stepwise term selection on the full training set, a regression
# tree on the selected terms, and rounded test predictions exported with
# their ids.
str(train)
model_glm <- lm(
  price ~ .,
  data = train
)
summary(model_glm)

# The original passed `diretion = both`: a misspelled argument name whose
# value was an undefined symbol. It only avoided an error because the
# argument was never evaluated.
model_glm2 <- step(
  model_glm,
  direction = "both"
)
summary(model_glm2)

# Hand rpart the selected formula explicitly rather than the lm object.
rp <- rpart(
  formula(model_glm2),
  data = train
)
rp.p <- predict(
  rp,
  newdata = test
)
rp.p <- round(rp.p)

result <- data.frame(
  id = test$id,
  price = rp.p
)
head(result)
write.csv(result, "result2.csv", row.names = FALSE)
# > head(result)
#           id  price
# 1 1974300020 947269
# 2 1974300020 947269
# 3 3630020380 556394
# 4 1771000290 556394
# 5 5126310470 458887
# 6 1870400605 947269