> stu1 <- data.frame(name = c('Jimin', 'Hyunsoo', 'Sangho', 'Yerim'),
+ year1 = c(100, 70, 80, 60),
+ year2 = c(77, 49, 53, 82))
> stu1
name year1 year2
1 Jimin 100 77
2 Hyunsoo 70 49
3 Sangho 80 53
4 Yerim 60 82
> stu1_long <- pivot_longer(stu1,
+ cols = c('year1', 'year2'),
+ names_to = 'year',
+ values_to = 'math')
> stu1_long
# A tibble: 8 × 3
name year math
<chr> <chr> <dbl>
1 Jimin year1 100
2 Jimin year2 77
3 Hyunsoo year1 70
4 Hyunsoo year2 49
5 Sangho year1 80
6 Sangho year2 53
7 Yerim year1 60
8 Yerim year2 82
> pivot_wider(stu1_long,
+ names_from = year,
+ values_from = math)
# A tibble: 4 × 3
name year1 year2
<chr> <dbl> <dbl>
1 Jimin 100 77
2 Hyunsoo 70 49
3 Sangho 80 53
4 Yerim 60 82
- rep(): 문자나 숫자를 반복적으로 사용할 때 쓰는 함수
> stu2 <- data.frame(id = rep(1:3, each = 4),
+ name = rep(c('Jimin', 'Hyunsoo', 'Sangho'), each = 4),
+ year = rep(2020:2021, each = 2),
+ term = rep(1:2, rep = 2),
+ math = c(86, 66, 67, 93, 97, 63, 58, 89, 95, 65, 64, 60),
+ eng = c(79, 84, 92, 73, 82, 89, 90, 75, 83, 74, 95, 71))
> stu2
id name year term math eng
1 1 Jimin 2020 1 86 79
2 1 Jimin 2020 2 66 84
3 1 Jimin 2021 1 67 92
4 1 Jimin 2021 2 93 73
5 2 Hyunsoo 2020 1 97 82
6 2 Hyunsoo 2020 2 63 89
7 2 Hyunsoo 2021 1 58 90
8 2 Hyunsoo 2021 2 89 75
9 3 Sangho 2020 1 95 83
10 3 Sangho 2020 2 65 74
11 3 Sangho 2021 1 64 95
12 3 Sangho 2021 2 60 71
> stu2 %>%
+ select(-math) %>%
+ pivot_wider(names_from = c('year', 'term'),
+ values_from = 'eng')
# A tibble: 3 × 6
id name `2020_1` `2020_2` `2021_1` `2021_2`
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 Jimin 79 84 92 73
2 2 Hyunsoo 82 89 90 75
3 3 Sangho 83 74 95 71
> stu2 %>%
+ group_by(id, name, year) %>%
+ summarize(avg_math = mean(math)) %>%
+ pivot_wider(names_from = 'year',
+ values_from = avg_math)
`summarise()` has grouped output by 'id', 'name'. You can override using the `.groups` argument.
# A tibble: 3 × 4
# Groups: id, name [3]
id name `2020` `2021`
<int> <chr> <dbl> <dbl>
1 1 Jimin 76 80
2 2 Hyunsoo 80 73.5
3 3 Sangho 80 62
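# (참고) 여기서부터의 stu1은 앞에서 만든 데이터가 아니라, 결측치가 포함된 별도의 데이터(name, class, math, eng 컬럼, 12행)이다.
> anyNA(stu1) # 데이터에 NA가 하나라도 있는지 확인 (TRUE/FALSE 반환)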
[1] TRUE
> table(is.na(stu1$math)) # TRUE의 개수가 결측치의 개수
FALSE TRUE
10 2
> ed <- stu1 %>%
+ filter(!is.na(math) & !is.na(eng))
> ed
name class math eng
1 Hwayoung 1 74 76
2 <NA> 2 56 70
3 Sojin 2 57 68
4 Goeun 1 39 63
5 Janghoon 1 55 70
6 Jungjae 2 92 100
7 Kyunghwan 2 71 65
> stu1 %>%
+ # drop_na(math, eng)
+ drop_na()
name class math eng
1 Hwayoung 1 74 76
2 Sojin 2 57 68
3 Goeun 1 39 63
4 Janghoon 1 55 70
5 Jungjae 2 92 100
6 Kyunghwan 2 71 65
> na.omit(stu1)
name class math eng
1 Hwayoung 1 74 76
4 Sojin 2 57 68
6 Goeun 1 39 63
7 Janghoon 1 55 70
11 Jungjae 2 92 100
12 Kyunghwan 2 71 65
# (참고) 여기서부터의 stu2는 앞에서 만든 stu2가 아니라, 결측치가 포함된 별도의 데이터(name, class, math, eng, gender 컬럼)이다.
> stu2 %>%
+ # drop_na()
+ drop_na(math, eng) %>%
+ group_by(class) %>%
+ summarize(mean(math), mean(eng))
# A tibble: 2 × 3
class `mean(math)` `mean(eng)`
<dbl> <dbl> <dbl>
1 1 56 69.7
2 2 69 75.8
> stu2 %>%
+ summarize(mean(math, na.rm = T)) # na.rm 인자로 NA를 제외하고 평균 계산
# 먼저 함수 파라미터를 확인해서 결측치 제거를 지원하면 사용
mean(math, na.rm = T)
1 63.7
# 결측치를 평균값으로 채우기
> stu2 %>%
+ mutate(math = ifelse(is.na(math), mean(math, na.rm = T), math))
name class math eng gender
1 Hwayoung 1 74.0 76 f
2 <NA> 2 56.0 70 <NA>
3 Sua 3 63.7 77 <NA>
4 Sojin 2 57.0 68 <NA>
5 Minjeong 3 42.0 NA <NA>
6 Goeun 1 39.0 63 <NA>
7 Janghoon 1 55.0 70 m
8 Sunghoon 2 68.0 NA <NA>
9 Jaeyoung 3 63.7 95 <NA>
10 Yeseung 3 83.0 NA <NA>
11 Jungjae 2 92.0 100 <NA>
12 Kyunghwan 2 71.0 65 <NA>
> stu2 %>%
+ fill(gender, .direction = 'down') # 위에 있는 값으로 결측치 채우기
name class math eng gender
1 Hwayoung 1 74 76 f
2 <NA> 2 56 70 f
3 Sua 3 NA 77 f
4 Sojin 2 57 68 f
5 Minjeong 3 42 NA f
6 Goeun 1 39 63 f
7 Janghoon 1 55 70 m
8 Sunghoon 2 68 NA m
9 Jaeyoung 3 NA 95 m
10 Yeseung 3 83 NA m
11 Jungjae 2 92 100 m
12 Kyunghwan 2 71 65 m
> stu2 %>%
+ fill(math)
name class math eng gender
1 Hwayoung 1 74 76 f
2 <NA> 2 56 70 <NA>
3 Sua 3 56 77 <NA>
4 Sojin 2 57 68 <NA>
5 Minjeong 3 42 NA <NA>
6 Goeun 1 39 63 <NA>
7 Janghoon 1 55 70 m
8 Sunghoon 2 68 NA <NA>
9 Jaeyoung 3 68 95 <NA>
10 Yeseung 3 83 NA <NA>
11 Jungjae 2 92 100 <NA>
12 Kyunghwan 2 71 65 <NA>
> stu2 %>%
+ replace_na(list(math = 0, eng = 10)) # 특정값으로 대체하기
name class math eng gender
1 Hwayoung 1 74 76 f
2 <NA> 2 56 70 <NA>
3 Sua 3 0 77 <NA>
4 Sojin 2 57 68 <NA>
5 Minjeong 3 42 10 <NA>
6 Goeun 1 39 63 <NA>
7 Janghoon 1 55 70 m
8 Sunghoon 2 68 10 <NA>
9 Jaeyoung 3 0 95 <NA>
10 Yeseung 3 83 10 <NA>
11 Jungjae 2 92 100 <NA>
12 Kyunghwan 2 71 65 <NA>
> hs <- data.frame(id = 1:10,
+ gender = c('f', 'f', 'f', 'f', 'm', 'm', 'm', 'mm', 'mm', 'm'),
+ age = c(17, 18, 18, 17, 17, 18, 18, 19, 19, 16),
+ math = c(65, 70, 50, 60, 1, 60, 90, 70, 110, 0))
> hs
id gender age math
1 1 f 17 65
2 2 f 18 70
3 3 f 18 50
4 4 f 17 60
5 5 m 17 1
6 6 m 18 60
7 7 m 18 90
8 8 mm 19 70
9 9 mm 19 110
10 10 m 16 0
> new_hs <- hs %>%
+ mutate(gender = ifelse(gender == 'mm', 'm', gender),
+ age = ifelse(age == 16, 17, age),
+ math = ifelse(math >= 0 & math <= 100 & math %% 5 == 0, math, NA))
> new_hs
id gender age math
1 1 f 17 65
2 2 f 18 70
3 3 f 18 50
4 4 f 17 60
5 5 m 17 NA
6 6 m 18 60
7 7 m 18 90
8 8 m 19 70
9 9 m 19 NA
10 10 m 17 0
> new_hs %>%
+ group_by(gender, age) %>%
+ summarize(avg = mean(math, na.rm = T))
`summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
# A tibble: 5 × 3
# Groups: gender [2]
gender age avg
<chr> <dbl> <dbl>
1 f 17 62.5
2 f 18 60
3 m 17 0
4 m 18 75
5 m 19 70
이 글은 패스트캠퍼스 데이터 분석 Master Class의 강의자료 일부를 발췌하여 작성되었습니다.