data = https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/train.csv
library(dplyr)
# Load the insurance training data straight from the public repository.
main <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/insurance/train.csv',
  encoding = 'UTF-8'
)

# Q1. Keep only the people whose Vehicle_Age is more than 2 years; among them,
# find those whose Annual_Premium is at least the median of the WHOLE data set,
# and report the mean of their Vintage values.

# The threshold is the median over all rows, not over the filtered subset.
me <- median(main$Annual_Premium)

# Rows whose vehicle-age category is '> 2 Years'.
ds1 <- filter(main, Vehicle_Age == '> 2 Years')

# One-row data frame with a single column named `mean`.
result1 <- ds1 %>%
  filter(Annual_Premium >= me) %>%
  summarise(mean = mean(Vintage))

print(result1)
> print(result1)
mean
1 154.4365
# reshape2 supplies dcast(), used below to go from long to wide format.
library(reshape2)

ds2 <- main

# Long format: one row per (Vehicle_Age, Gender) pair holding the mean
# Annual_Premium of that pair.
temp <- aggregate(Annual_Premium ~ Vehicle_Age + Gender, data = ds2, FUN = mean)

# Wide format: one row per Vehicle_Age, one column per Gender value.
result2 <- dcast(
  data = temp,
  formula = Vehicle_Age ~ Gender,
  value.var = 'Annual_Premium'
)

print(result2)
> result2
Vehicle_Age Female Male
1 < 1 Year 29972.29 30310.98
2 > 2 Years 36108.37 35303.87
3 1-2 Year 30762.25 30413.09
평가지표 : f1-score
trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/train.csv
testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/test.csv
subData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/submission.csv
library(dplyr)
library(caret)
library(randomForest)
library(ModelMetrics)
library(tidyr)
library(readr)
# Data description: heart-disease prediction; the label is the `target` column.
# Empty strings, single spaces and 'NA' are all read as missing values.
train <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/train.csv',
  encoding = 'UTF-8',
  na.strings = c('', ' ', 'NA', NA)
)
test <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/test.csv',
  encoding = 'UTF-8',
  na.strings = c('', ' ', 'NA', NA)
)
sub <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/heart/submission.csv',
  encoding = 'UTF-8',
  na.strings = c('', ' ', 'NA', NA)
)

# Per-column NA counts (the original author notes that there are none).
colSums(is.na(train))

# Recode the categorical predictors as factors. Binary flags and `thal` get
# readable labels; the remaining codes are kept as-is and only converted.
train <- dplyr::mutate(
  train,
  sex = as.factor(ifelse(sex == 1, 'male', 'female')),
  exang = as.factor(ifelse(exang == 1, 'yes', 'no')),
  fbs = as.factor(ifelse(fbs == 1, 'true', 'false')),
  thal = as.factor(
    ifelse(thal == 1, 'normal', ifelse(thal == 2, 'fixed', 'reversable'))
  ),
  restecg = as.factor(restecg),
  cp = as.factor(cp),
  slope = as.factor(slope),
  ca = as.factor(ca),
  # The label must be a factor so that the model is fit as a classifier.
  target = as.factor(target)
)

# Same recoding for the test set (no `target` conversion is applied here).
test <- dplyr::mutate(
  test,
  sex = as.factor(ifelse(sex == 1, 'male', 'female')),
  exang = as.factor(ifelse(exang == 1, 'yes', 'no')),
  fbs = as.factor(ifelse(fbs == 1, 'true', 'false')),
  thal = as.factor(
    ifelse(thal == 1, 'normal', ifelse(thal == 2, 'fixed', 'reversable'))
  ),
  restecg = as.factor(restecg),
  cp = as.factor(cp),
  slope = as.factor(slope),
  ca = as.factor(ca)
)

str(train)
# Min-max ('range') scaling of the numeric predictors.
#
# The scaler is fitted on the TRAINING data only and the same fitted transform
# is then applied to both train and test. Fitting a second scaler on the test
# set (as was done before) maps test values with the test set's own min/max,
# so a given raw value would be encoded differently in train and test and the
# model would be scored on features that do not match what it was trained on.
#
# As before, the first column is left out of the fit and therefore stays
# unscaled. `target` is excluded as well so that the fitted object only
# refers to columns that also exist in `test`.
model <- preProcess(
  train[, setdiff(names(train)[-1], 'target')],
  method = c('range')
)

train <- predict(model, train)

# Test values outside the training min/max may land slightly outside [0, 1];
# that is expected when the training-set scaler is reused.
test <- predict(model, test)
# Random forests are stochastic (bootstrap samples and random feature
# subsets), so fix the RNG seed to make `pred` and result.csv reproducible.
set.seed(21)

# Classification forest on all predictors; do.trace prints progress per tree.
rf <- randomForest(
  target ~ .,
  data = train,
  ntree = 400,
  do.trace = TRUE
)

# Predicted class for each test row.
pred <- predict(rf, newdata = test)

# Reference labels taken from the submission file.
# NOTE(review): the level names are assigned positionally, which assumes both
# classes occur in sub$X0 and sort as 0 then 1 -- confirm against the file.
sub$X0 <- as.factor(sub$X0)
levels(sub$X0) <- c('0', '1')

# mode = 'everything' includes F1, the metric named for this task;
# '1' is treated as the positive class.
caret::confusionMatrix(
  data = pred,
  reference = sub$X0,
  positive = '1',
  mode = 'everything'
)
# Submission file: a single `target` column with the predicted classes.
result <- data.frame(target = pred)

# Use FALSE rather than F: F is an ordinary variable that can be reassigned.
write.csv(result, "result.csv", row.names = FALSE)
> str(train)
'data.frame': 242 obs. of 14 variables:
$ age : int 60 51 51 59 60 46 54 45 54 44 ...
$ sex : Factor w/ 2 levels "female","male": 2 1 2 2 2 1 2 1 2 1 ...
$ cp : Factor w/ 4 levels "0","1","2","3": 1 3 4 3 1 3 1 2 2 3 ...
$ trestbps: num 0.434 0.245 0.292 0.528 0.292 ...
$ chol : num 0.381 0.386 0.199 0.196 0.301 ...
$ fbs : Factor w/ 2 levels "false","true": 1 1 1 2 1 1 1 1 1 1 ...
$ restecg : Factor w/ 3 levels "0","1","2": 1 1 1 2 1 1 2 1 2 2 ...
$ thalach : num 0.762 0.638 0.333 0.638 0.486 ...
$ exang : Factor w/ 2 levels "no","yes": 1 1 2 1 2 2 1 1 1 1 ...
$ oldpeak : num 0.214 0.107 0.25 0.286 0.5 ...
$ slope : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 1 2 2 3 2 ...
$ ca : Factor w/ 5 levels "0","1","2","3",..: 3 1 2 1 2 1 2 1 1 1 ...
$ thal : Factor w/ 3 levels "fixed","normal",..: 3 1 1 1 3 1 3 1 3 1 ...
$ target : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 1 2 2 2 ...
> str(test)
'data.frame': 61 obs. of 13 variables:
$ age : int 62 54 64 56 40 41 58 51 54 64 ...
$ sex : Factor w/ 2 levels "female","male": 1 2 1 2 2 1 2 1 2 2 ...
$ cp : Factor w/ 4 levels "0","1","2","3": 1 3 1 3 1 3 1 3 1 4 ...
$ trestbps: num 0.3 0.25 1 0.375 0.125 0.15 0.575 0.25 0.5 0.125 ...
$ chol : num 0.295 0.47 0.708 0.463 0.146 ...
$ fbs : Factor w/ 2 levels "false","true": 1 1 1 2 1 1 1 1 1 1 ...
$ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 1 1 1 2 1 2 1 ...
$ thalach : num 0.716 0.519 0.605 0.457 0.111 ...
$ exang : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 1 2 ...
$ oldpeak : num 0 0.1 0 0.15 0.5 0 0.5 0.15 0.3 0.45 ...
$ slope : Factor w/ 3 levels "0","1","2": 3 2 3 2 2 3 2 3 3 2 ...
$ ca : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 2 1 1 2 1 1 1 ...
$ thal : Factor w/ 3 levels "fixed","normal",..: 1 3 1 2 3 1 3 1 1 1 ...
> head(result, 10)
target
1 1
2 1
3 1
4 0
5 0
6 1
7 0
8 1
9 1
10 1
library(dplyr)
library(tidyr)
# Load the mobile-price training data.
main <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/train.csv',
  encoding = 'UTF-8'
)

# Q1. Group by each price_range value and, within each group, find the
# n_cores value that occurs most often together with its frequency.
ds1 <- main

result1 <- ds1 %>%
  # Frequency of every (price_range, n_cores) pair.
  group_by(price_range, n_cores) %>%
  summarise(n = n()) %>%
  # Most frequent pairs first; the result is still grouped by price_range,
  # so slice(1) then keeps the top row of each price_range.
  arrange(desc(n)) %>%
  slice(1)

print(result1)
> print(result1)
# A tibble: 4 x 3
# Groups: price_range [4]
price_range n_cores n
<int> <int> <int>
1 0 2 69
2 1 1 76
3 2 4 73
4 3 5 70
문제타입 : 분류유형
평가지표 : accuracy
trainData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/train.csv
testData url : https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/test.csv
library(dplyr)
library(caret)
library(ModelMetrics)
library(scales)
library(rpart)
# Data description: mobile-phone price prediction; the `price_range` column
# ranges from 0 (cheap) to 3 (very expensive).
# Empty strings and single spaces are read as missing values.
train <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/train.csv',
  encoding = 'UTF-8',
  na.strings = c('', ' ', NA)
)
test <- read.csv(
  'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/test.csv',
  encoding = 'UTF-8',
  na.strings = c('', ' ', NA)
)
# Quick look at the raw training data.
# The earlier `normal(train$battery_power)` call was dropped: `normal` is not
# defined in this script or in the attached packages, and battery_power is
# range-scaled together with every other numeric column further below.
# Pre-scaling it on train only would also make train and test disagree once
# both are transformed with the same scaler.
head(train$battery_power)
str(train)
unique(train$m_dep)

# Dependent variable: a factor, so rpart builds a classification tree.
train$price_range <- as.factor(train$price_range)

# Categorical predictors (blue, three_g, touch_screen, wifi, n_cores, four_g,
# dual_sim). Test factors reuse the training levels so both data sets share
# one encoding; a value unseen in training becomes NA instead of creating a
# level the fitted tree has never seen.
factor_cols <- c(
  'blue', 'three_g', 'touch_screen', 'wifi', 'n_cores', 'four_g', 'dual_sim'
)
for (col in factor_cols) {
  train[[col]] <- as.factor(train[[col]])
  test[[col]] <- factor(test[[col]], levels = levels(train[[col]]))
}

# Data scaling: fit the min-max scaler on the training predictors only and
# apply that same transform to train and test. The previous code fitted a
# second scaler on `test[,1]` (a bare vector holding the id column), which
# neither scaled the test predictors nor matched the training transform.
model <- preProcess(
  train[, setdiff(names(train), 'price_range')],
  method = c('range')
)
train2 <- predict(model, train)
# Columns the scaler was not fitted on (e.g. the test id) pass through as-is.
test2 <- predict(model, test)

# Classification tree on all predictors.
rp <- rpart(
  price_range ~ .,
  train2
)

# Predicted price_range class for every test row.
rp.p <- predict(
  rp,
  newdata = test2,
  type = 'class'
)
# Hold-out evaluation on the labelled training data.
#
# The test set has no labels, so test predictions cannot be scored. The
# previous code compared the test predictions with the labels of 1000 randomly
# drawn training rows, i.e. predictions and references belonged to different
# phones and the resulting confusion matrix was meaningless. Instead, hold out
# 1000 training rows, fit a tree on the remaining rows and score it on the
# held-out rows.
set.seed(21)
parts <- sample(
  seq_len(nrow(train)),
  size = 1000
)
sam <- train[parts, ]

# Validation tree: fitted without the held-out rows (scaled data, as for `rp`).
rp.val <- rpart(
  price_range ~ .,
  train2[-parts, ]
)
sam.p <- predict(
  rp.val,
  newdata = train2[parts, ],
  type = 'class'
)

# Accuracy (the metric named for this task) is part of the printed summary.
caret::confusionMatrix(
  sam.p,
  sam$price_range,
  mode = 'everything'
)
# Submission file: the test id next to the predicted price_range class.
result <- data.frame(id = test$id, price_range = rp.p)
write.csv(result, 'result.csv', row.names = FALSE)
> head(result)
id price_range
1 1 3
2 2 3
3 3 2
4 4 3
5 5 1
6 6 3