R - tidyr 패키지

구너탱·2023년 11월 30일
0

R

목록 보기
3/3
post-thumbnail

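📖 tidyr

아래 예제는 `library(tidyr)`와 `library(dplyr)`(`%>%`, `select()`, `group_by()`, `mutate()` 등)를 불러온 상태에서 실행한다.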

📌 pivot

> stu1 <- data.frame(name = c('Jimin', 'Hyunsoo', 'Sangho', 'Yerim'),
+                    year1 = c(100, 70, 80, 60),
+                    year2 = c(77, 49, 53, 82))
> stu1
     name year1 year2
1   Jimin   100    77
2 Hyunsoo    70    49
3  Sangho    80    53
4   Yerim    60    82

🔹 pivot_longer()

> stu1_long <- pivot_longer(stu1,
+              cols = c('year1', 'year2'),
+              names_to = 'year',
+              values_to = 'math')
> stu1_long
# A tibble: 8 × 3
  name    year   math
  <chr>   <chr> <dbl>
1 Jimin   year1   100
2 Jimin   year2    77
3 Hyunsoo year1    70
4 Hyunsoo year2    49
5 Sangho  year1    80
6 Sangho  year2    53
7 Yerim   year1    60
8 Yerim   year2    82

🔹 pivot_wider()

> pivot_wider(stu1_long,
+             names_from = year,
+             values_from = math)
# A tibble: 4 × 3
  name    year1 year2
  <chr>   <dbl> <dbl>
1 Jimin     100    77
2 Hyunsoo    70    49
3 Sangho     80    53
4 Yerim      60    82
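  • rep(): 벡터의 값을 반복해서 만드는 함수 (each = 각 원소를 연속해서 반복, times = 벡터 전체를 반복). 참고로 아래 `rep(1:2, rep = 2)`의 `rep =`는 rep()의 인자가 아니어서 무시되며, data.frame()이 길이가 짧은 열을 행 수에 맞게 재활용하기 때문에 결과가 의도대로 나온다. 반복 횟수를 명시하려면 `times =`를 사용한다.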
> stu2 <- data.frame(id = rep(1:3, each = 4),
+                    name = rep(c('Jimin', 'Hyunsoo', 'Sangho'), each = 4),
+                    year = rep(2020:2021, each = 2),
+                    term = rep(1:2, rep = 2),
+                    math = c(86, 66, 67, 93, 97, 63, 58, 89, 95, 65, 64, 60),
+                    eng = c(79, 84, 92, 73, 82, 89, 90, 75, 83, 74, 95, 71))
> stu2
   id    name year term math eng
1   1   Jimin 2020    1   86  79
2   1   Jimin 2020    2   66  84
3   1   Jimin 2021    1   67  92
4   1   Jimin 2021    2   93  73
5   2 Hyunsoo 2020    1   97  82
6   2 Hyunsoo 2020    2   63  89
7   2 Hyunsoo 2021    1   58  90
8   2 Hyunsoo 2021    2   89  75
9   3  Sangho 2020    1   95  83
10  3  Sangho 2020    2   65  74
11  3  Sangho 2021    1   64  95
12  3  Sangho 2021    2   60  71

🔹 응용

> stu2 %>%
+   select(-math) %>%
+   pivot_wider(names_from = c('year', 'term'),
+               values_from = 'eng')
# A tibble: 3 × 6
     id name    `2020_1` `2020_2` `2021_1` `2021_2`
  <int> <chr>      <dbl>    <dbl>    <dbl>    <dbl>
1     1 Jimin         79       84       92       73
2     2 Hyunsoo       82       89       90       75
3     3 Sangho        83       74       95       71

> stu2 %>%
+   group_by(id, name, year) %>%
+   summarize(avg_math = mean(math)) %>%
+   pivot_wider(names_from = 'year',
+               values_from = avg_math)
`summarise()` has grouped output by 'id', 'name'. You can override using the `.groups` argument.
# A tibble: 3 × 4
# Groups:   id, name [3]
     id name    `2020` `2021`
  <int> <chr>    <dbl>  <dbl>
1     1 Jimin       76   80  
2     2 Hyunsoo     80   73.5
3     3 Sangho      80   62  

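📌 결측치 처리

※ 이 절부터 사용하는 stu1, stu2는 위 pivot 예제의 데이터가 아니라, 결측치(NA)가 포함된 12행짜리 학생 데이터(name, class, math, eng 열, stu2에는 gender 열 추가)다. 이 데이터를 만드는 코드는 본문에 생략되어 있다.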

🔹 anyNA()

> anyNA(stu1) # 데이터에 NA가 있는지 확인 T/F
[1] TRUE

🔹 is.na()

> table(is.na(stu1$math)) # TRUE 가 결측치

FALSE  TRUE 
   10     2 

🔹 결측치가 있는 행을 조건문으로 제거 (math, eng만 검사하므로 name이 NA인 행은 그대로 남는다)

> ed <- stu1 %>%
+   filter(!is.na(math) & !is.na(eng))
> ed
       name class math eng
1  Hwayoung     1   74  76
2      <NA>     2   56  70
3     Sojin     2   57  68
4     Goeun     1   39  63
5  Janghoon     1   55  70
6   Jungjae     2   92 100
7 Kyunghwan     2   71  65

🔹 drop_na() : 열을 지정하지 않으면 어느 열에든 NA가 있는 행을 모두 제거

> stu1 %>%
+ #  drop_na(math, eng)
+   drop_na()
       name class math eng
1  Hwayoung     1   74  76
2     Sojin     2   57  68
3     Goeun     1   39  63
4  Janghoon     1   55  70
5   Jungjae     2   92 100
6 Kyunghwan     2   71  65

🔹 na.omit()

> na.omit(stu1)
        name class math eng
1   Hwayoung     1   74  76
4      Sojin     2   57  68
6      Goeun     1   39  63
7   Janghoon     1   55  70
11   Jungjae     2   92 100
12 Kyunghwan     2   71  65

🔹 na.rm : 함수의 파라미터로 결측치 제거 (첫 번째 예제는 drop_na()로 행을 먼저 제거한 뒤 집계하고, 두 번째 예제가 na.rm을 사용한다)

> stu2 %>%
+ #  drop_na()
+   drop_na(math, eng) %>%
+   group_by(class) %>%
+   summarize(mean(math), mean(eng))
# A tibble: 2 × 3
  class `mean(math)` `mean(eng)`
  <dbl>        <dbl>       <dbl>
1     1           56        69.7
2     2           69        75.8

> stu2 %>%
+   summarize(mean(math, na.rm = T)) # 파라미터로 na 값 없애기
# 함수가 na.rm 같은 결측치 제거 파라미터를 지원하는지 먼저 확인하고, 지원하면 그 파라미터를 사용
  mean(math, na.rm = T)
1                  63.7

# 결측치를 평균값으로 채우기
> stu2 %>%
+   mutate(math = ifelse(is.na(math), mean(math, na.rm = T), math))
        name class math eng gender
1   Hwayoung     1 74.0  76      f
2       <NA>     2 56.0  70   <NA>
3        Sua     3 63.7  77   <NA>
4      Sojin     2 57.0  68   <NA>
5   Minjeong     3 42.0  NA   <NA>
6      Goeun     1 39.0  63   <NA>
7   Janghoon     1 55.0  70      m
8   Sunghoon     2 68.0  NA   <NA>
9   Jaeyoung     3 63.7  95   <NA>
10   Yeseung     3 83.0  NA   <NA>
11   Jungjae     2 92.0 100   <NA>
12 Kyunghwan     2 71.0  65   <NA>

🔹 fill() : 결측치 값 채우기

> stu2 %>%
+   fill(gender, .direction = 'down') # 위에 있는 값으로 결측치 채우기
        name class math eng gender
1   Hwayoung     1   74  76      f
2       <NA>     2   56  70      f
3        Sua     3   NA  77      f
4      Sojin     2   57  68      f
5   Minjeong     3   42  NA      f
6      Goeun     1   39  63      f
7   Janghoon     1   55  70      m
8   Sunghoon     2   68  NA      m
9   Jaeyoung     3   NA  95      m
10   Yeseung     3   83  NA      m
11   Jungjae     2   92 100      m
12 Kyunghwan     2   71  65      m

> stu2 %>%
+   fill(math)
        name class math eng gender
1   Hwayoung     1   74  76      f
2       <NA>     2   56  70   <NA>
3        Sua     3   56  77   <NA>
4      Sojin     2   57  68   <NA>
5   Minjeong     3   42  NA   <NA>
6      Goeun     1   39  63   <NA>
7   Janghoon     1   55  70      m
8   Sunghoon     2   68  NA   <NA>
9   Jaeyoung     3   68  95   <NA>
10   Yeseung     3   83  NA   <NA>
11   Jungjae     2   92 100   <NA>
12 Kyunghwan     2   71  65   <NA>

🔹 replace_na() : 특정값 대체

> stu2 %>%
+   replace_na(list(math = 0, eng = 10)) # 특정값으로 대체하기
        name class math eng gender
1   Hwayoung     1   74  76      f
2       <NA>     2   56  70   <NA>
3        Sua     3    0  77   <NA>
4      Sojin     2   57  68   <NA>
5   Minjeong     3   42  10   <NA>
6      Goeun     1   39  63   <NA>
7   Janghoon     1   55  70      m
8   Sunghoon     2   68  10   <NA>
9   Jaeyoung     3    0  95   <NA>
10   Yeseung     3   83  10   <NA>
11   Jungjae     2   92 100   <NA>
12 Kyunghwan     2   71  65   <NA>

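📌 이상치 처리

아래 예제에서는 gender의 오타 'mm'을 'm'으로, age 16을 17로 바꾸고, math가 0~100 범위를 벗어나거나 5의 배수가 아니면 NA로 바꾼 뒤 na.rm으로 제외하고 평균을 구한다.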

> hs <- data.frame(id = 1:10,
+                  gender = c('f', 'f', 'f', 'f', 'm', 'm', 'm', 'mm', 'mm', 'm'),
+                  age = c(17, 18, 18, 17, 17, 18, 18, 19, 19, 16),
+                  math = c(65, 70, 50, 60, 1, 60, 90, 70, 110, 0))
> hs
   id gender age math
1   1      f  17   65
2   2      f  18   70
3   3      f  18   50
4   4      f  17   60
5   5      m  17    1
6   6      m  18   60
7   7      m  18   90
8   8     mm  19   70
9   9     mm  19  110
10 10      m  16    0

> new_hs <- hs %>%
+   mutate(gender = ifelse(gender == 'mm', 'm', gender),
+          age = ifelse(age == 16, 17, age),
+          math = ifelse(math >= 0 & math <= 100 & math %% 5 == 0, math, NA))
> new_hs
   id gender age math
1   1      f  17   65
2   2      f  18   70
3   3      f  18   50
4   4      f  17   60
5   5      m  17   NA
6   6      m  18   60
7   7      m  18   90
8   8      m  19   70
9   9      m  19   NA
10 10      m  17    0

> new_hs %>%
+   group_by(gender, age) %>%
+   summarize(avg = mean(math, na.rm = T))
`summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
# A tibble: 5 × 3
# Groups:   gender [2]
  gender   age   avg
  <chr>  <dbl> <dbl>
1 f         17  62.5
2 f         18  60  
3 m         17   0  
4 m         18  75  
5 m         19  70  

이 글은 패스트캠퍼스 데이터 분석 Master Class의 강의자료 일부를 발췌하여 작성되었습니다.

profile
데이터 꿈나물

0개의 댓글