0824 - R

์˜ค๋Š˜ยท2022๋…„ 8์›” 24์ผ
0

A

๋ชฉ๋ก ๋ณด๊ธฐ
1/46

๐ŸŒฑ ๋ฐ์ดํ„ฐ ์ฝ์–ด์˜ค๊ธฐ์™€ ์ €์žฅ

์„ค์น˜

์—‘์…€ ์ฝ์„ ํŒจํ‚ค์ง€์™€ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์„ค์น˜

install.packages("readxl")
library(readxl)

์ฝ๊ธฐ

  • ์—‘์…€
read_excel("์ฝ์„ ํŒŒ์ผ์ด ์žˆ๋Š” ์œ„์น˜")
ex) read_excel("C:/R/excel_exam.xlsx")

  • csv
read.csv("์ฝ์„ ํŒŒ์ผ")
ex) read.csv("C:/R/csv_exam.csv")

ํŒŒ์ผ๋กœ ์ €์žฅ

먼저 값이 너무 많으니까 6행까지만 df에 저장

๐Ÿƒ ๋ฐฉ๋ฒ• 1
> df <- read.csv("C:/R/csv_exam.csv")[1:6,]
> df
  id class math english science
1  1     1   50      98      50
2  2     1   60      97      60
3  3     1   45      86      78
4  4     1   30      98      58
5  5     2   25      80      65
6  6     2   50      89      98

๐Ÿƒ ๋ฐฉ๋ฒ• 2
> df <- read.csv("C:/R/csv_exam.csv")[1:5, 3:5]
> df
  math english science
1   50      98      50
2   60      97      60
3   45      86      78
4   30      98      58
5   25      80      65

๐Ÿƒ ๋ฐฉ๋ฒ• 3
> df <- read.csv("C:/R/csv_exam.csv")[1:5, -c(1,2)]
> df
  math english science
1   50      98      50
2   60      97      60
3   45      86      78
4   30      98      58
5   25      80      65

์ €์žฅ

> # write.csv(๋ฌด์—‡์„(๊ฐ์ฒด๋ฅผ), ์–ด๋–ค์ด๋ฆ„์œผ๋กœ ์ €์žฅํ•  ๊ฒƒ์ธ๊ฐ€)
> write.csv(df, "csv_test.txt")
> write.csv(df, "csv_text.csv")

์‹คํ–‰์‹œ

ํŒŒ์ผ ์ด๋ฆ„์ด ํ•œ๊ธ€์ธ ๊ฒฝ์šฐ

์•ˆ์ฝํžˆ๋Š” ๊ฒฝ์šฐ๊ฐ€ ์žˆ๋‹ค. ์ด๋Ÿด๋•Œ๋Š” read.csv()๋‚˜ read.table() ํ•จ์ˆ˜ ๋’ค์— fileEncoding="euc-kr" ํ˜น์€ fileEncoding="cp949" ์ด๋ผ๋Š” ์˜ต์…˜์„ ๋„ฃ์–ด๋ณด๊ธธ

read.csv("C:/R/ํ•œ๊ธ€ํŒŒ์ผ์ด๋ฆ„.csv", fileEncoding = "euc-kr")
read.csv("c:/r/ํ•œ๊ธ€ํŒŒ์ผ์ด๋ฆ„.csv", fileEncoding = "cp949")

+) ํŒŒ์ผ์ด ์ง€๊ธˆ ๋‚ด working directory ๋ฐ‘์— ์žˆ๋‹ค๋ฉด ์œ„์ฒ˜๋Ÿผ ์ผ์ผํžˆ ์ƒ์„ธ ๊ฒฝ๋กœ๋ฅผ ์ ์–ด์ฃผ์ง€ ์•Š์•„๋„ ๋œ๋‹ค.

read.csv("C:/R/ํ•œ๊ธ€ํŒŒ์ผ์ด๋ฆ„.csv")
read.csv("ํ•œ๊ธ€ํŒŒ์ผ์ด๋ฆ„.csv")
๋‘˜์€ ๋˜‘๊ฐ™์ด ๋™์ž‘

โˆด๋งŒ์•ฝ ๊ฒฝ๋กœ ์ง€์ • ์•ˆํ•˜๊ณ  ํŒŒ์ผ ์ด๋ฆ„๋งŒ ์จ์„œ ๋ถˆ๋Ÿฌ์˜ค๊ณ  ์‹ถ๋‹ค๋ฉด
์ง€๊ธˆ ๋‚ด getwd()๋ฅผ ํ™•์ธํ•ด ๊ทธ ๋ฐ‘์— ๋ถˆ๋Ÿฌ์˜ฌ ํŒŒ์ผ์„ ๋„ฃ์–ด๋ณด์‹œ๊ธธ

๋ฐ์ดํ„ฐ ์‚ดํŽด๋ณด๊ธฐ

๐Ÿƒ head() : ์•ž์—์„œ ๋ช‡ ์ค„ ์ถœ๋ ฅํ•  ๊ฒƒ์ธ๊ฐ€?
			default๊ฐ’์€ 6
			[์‚ฌ์šฉ์˜ˆ์‹œ] head(df)
					  head(df, 2) -- ์•ž์—์„œ ๋ถ€ํ„ฐ 2์ค„ ์ถœ๋ ฅ

๐Ÿƒ tail() : ๋’ค์—์„œ๋ถ€ํ„ฐ ๋ช‡ ์ค„ ์ถœ๋ ฅํ•  ๊ฒƒ์ธ๊ฐ€?
			default๊ฐ’์€ 6
            [์‚ฌ์šฉ์˜ˆ์‹œ] tail(df)
            		  tail(df, 3) -- ๋’ค์—์„œ 3์ค„ ์ถœ๋ ฅ

๐Ÿƒ str() : ๋ฐ์ดํ„ฐ ์†์„ฑ ํ™•์ธ
			[์‚ฌ์šฉ์˜ˆ์‹œ]
            > str(df)
            tibble [20 ร— 5] (S3: tbl_df/tbl/data.frame)
             $ id     : num [1:20] 1 2 3 4 5 6 7 8 9 10 ...
             $ class  : num [1:20] 1 1 1 1 2 2 2 2 3 3 ...
             $ math   : num [1:20] 50 60 45 30 25 50 80 90 20 50 ...
             $ english: num [1:20] 98 97 86 98 80 89 90 78 98 98 ...
             $ science: num [1:20] 50 60 78 58 65 98 45 25 15 45 ...
            
            ๊ฐ ์ปฌ๋Ÿผ ๋ช… : ์ปฌ๋Ÿผ ํƒ€์ž… [๋ฐ์ดํ„ฐ๊ฐ€ ๋ช‡๊ฐœ์žˆ๋Š”๊ฐ€] ์˜ˆ์‹œ๋กœ ๋ณด์—ฌ์ฃผ๋Š” ๊ฐ’
            
๐Ÿƒ summary() : ์š”์•ฝ ํ†ต๊ณ„ ์ถœ๋ ฅ
				[์‚ฌ์šฉ์˜ˆ์‹œ]
                > summary(df)
                       id            class        math          english    
                 Min.   : 1.00   Min.   :1   Min.   :20.00   Min.   :56.0  
                 1st Qu.: 5.75   1st Qu.:2   1st Qu.:45.75   1st Qu.:78.0  
                 Median :10.50   Median :3   Median :54.00   Median :86.5  
                 Mean   :10.50   Mean   :3   Mean   :57.45   Mean   :84.9  
                 3rd Qu.:15.25   3rd Qu.:4   3rd Qu.:75.75   3rd Qu.:98.0  
                 Max.   :20.00   Max.   :5   Max.   :90.00   Max.   :98.0  
                    science     
                 Min.   :12.00  
                 1st Qu.:45.00  
                 Median :62.50  
                 Mean   :59.45  
                 3rd Qu.:78.00  
                 Max.   :98.00
                 
                 ์ง„์งœ ์š”์•ฝ์ด๋‹ค. ํ•ด๋‹น ์ปฌ๋Ÿผ์˜ ์ตœ์†Œ๊ฐ’, ์ตœ๋Œ€๊ฐ’, ํ‰๊ท ๊ฐ’๋“ฑ์„ ํ™•์ธ ๊ฐ€๋Šฅ
                 

๐ŸŒฑ ๊ฐ€๊ณต

dplyr로 가공

# ํŒจํ‚ค์ง€๊ฐ€ ์—†๋‹ค๋ฉด ํŒจํ‚ค์ง€ ์„ค์น˜๋ถ€ํ„ฐ 
# install.packages("dplyr")

# dplyr ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์‚ฌ์šฉํ•˜๊ฒ ๋‹ค ๋ถˆ๋Ÿฌ์˜ค๊ธฐ
library(dplyr)

๊ฐ€์žฅ ๋งŽ์ด ์“ฐ์ด๋Š” 5๊ฐœ๋ณด๊ธฐ

1. filter() : ํ–‰ ์ถ”์ถœ

๐Ÿƒ filter(.data, ...)
          .data : ์–ด๋–ค ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„์— ๋Œ€ํ•˜์—ฌ
          ... : ์–ด๋–ค ์กฐ๊ฑด์„ ๊ฑธ ๊ฒƒ์ธ๊ฐ€

          [์‚ฌ์šฉ ์˜ˆ์‹œ]
          > # df์—์„œ class == 1์ธ ๊ฒƒ๋“ค๋งŒ ๋ณด๊ฒ ๋‹ค
          > filter(df, class == 1)
          # A tibble: 4 ร— 5
               id class  math english science
            <dbl> <dbl> <dbl>   <dbl>   <dbl>
          1     1     1    50      98      50
          2     2     1    60      97      60
          3     3     1    45      86      78
          4     4     1    30      98      58
         
          
          ์œ„๋Š” RStudio `help`์—์„œ ์„ค๋ช…ํ•˜๋Š” filter์˜ ์‚ฌ์šฉ๋ชจ์Šต
          ํ•˜์ง€๋งŒ ์•„๋ž˜์ฒ˜๋Ÿผ ํŒŒ์ดํ”„๋ฅผ ์‚ฌ์šฉํ•œ ๋ชจ์Šต์„
          ๋” ๋งŽ์ด ์‚ฌ์šฉ ํ•œ๋‹ค๊ณ  ํ•จ

๐Ÿงƒ ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ %>% filter(์กฐ๊ฑด)

          [์‚ฌ์šฉ ์˜ˆ์‹œ]
          > # df์—์„œ %>% filter(class๊ฐ€ 1์ธ ๊ฒƒ๋งŒ)
          > df %>% filter(class == 1)
          # A tibble: 4 ร— 5
               id class  math english science
            <dbl> <dbl> <dbl>   <dbl>   <dbl>
          1     1     1    50      98      50
          2     2     1    60      97      60
          3     3     1    45      86      78
          4     4     1    30      98      58
          
          > or and ๋น„๊ต ๋ชจ๋‘ ๊ฐ€๋Šฅ
          > # 1๋ฐ˜์ด๋ฉด์„œ ์ˆ˜ํ•™์ ์ˆ˜ 50์ด์ƒ
          > df %>% filter(class == 1 & math >=50)
          # A tibble: 2 ร— 5
               id class  math english science
            <dbl> <dbl> <dbl>   <dbl>   <dbl>
          1     1     1    50      98      50
          2     2     1    60      97      60
          
          ># ํ•˜๋‚˜์˜ ์ปฌ๋Ÿผ์—์„œ ์ค‘๋ณต๋œ ์—ฌ๋Ÿฌ๊ฐœ๋ฅผ ๋ฝ‘๊ณ  ์‹ถ์„๋•Œ
          ># %in% c() ๋ฅผ ์‚ฌ์šฉํ•˜๊ธฐ๋„ ํ•จ
          ># ex) 1, 3, 5๋ฐ˜์— ํ•ด๋‹นํ•˜๋ฉด ์ถ”์ถœ
          >df %>% filter(class == 1 | class == 3 | class ==5)
          >df %>% filter(class %in% c(1, 3, 5))
          ># ์œ„ ๋‘๊ฐœ๋Š” ๊ฐ™์€ ๊ฒฐ๊ณผ ๊ฐ’

2. select() : ์—ด ์ถ”์ถœ

🍃 select(.data, ...)
          .data : 어떤 데이터 프레임에 대하여
          ... : 어떤 열을 추출할 것인가
          [์‚ฌ์šฉ์˜ˆ์‹œ]
          > select(df, math)
          # A tibble: 20 ร— 1
              math
             <dbl>
           1    50
           2    60
           3    45
           4    30
           .....
          
 
pipe ๋ผ์ธ ์‚ฌ์šฉ์‹œ
๐Ÿงƒ ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ %>% select(์ถ”์ถœํ•  ์—ด)
      [์‚ฌ์šฉ ์˜ˆ์‹œ]
      > df %>% select(math)
      # A tibble: 20 ร— 1
          math
         <dbl>
       1    50
       2    60
       3    45
       ...

      > # ์—ฌ๋Ÿฌ๊ฐœ ๋ฝ‘๊ธฐ
      > df %>% select(math, english)
      # A tibble: 20 ร— 2
          math english
         <dbl>   <dbl>
       1    50      98
       2    60      97
       3    45      86
       4    30      98
       5    25      80
       
      ># ํŠน์ • ์—ด ์ œ์™ธ
      > df %>% select(-math)
      # A tibble: 20 ร— 4
            id class english science
         <dbl> <dbl>   <dbl>   <dbl>
       1     1     1      98      50
       2     2     1      97      60
       3     3     1      86      78
       4     4     1      98      58
       5     5     2      80      65
       
      ># ํ•„ํ„ฐ์™€ ํ•จ๊ป˜ ์“ฐ๊ธฐ
      ># filter๋กœ ๋จผ์ € ๊ฑธ๋Ÿฌ๋‚ด๊ณ  select
      ># 1๋ฐ˜์ธ์• ๋“ค ์ˆ˜ํ•™์„ฑ์ ๋ณด์—ฌ์ฃผ๊ธฐ
      > df %>% filter(class == 1) %>% select(math)
      # A tibble: 4 ร— 1
         math
        <dbl>
      1    50
      2    60
      3    45
      4    30
      
      ># 1반에 수학성적이 50 이상인 애들의 영어와 과학 열을 보여달라
      > df %>% filter(class == 1 & math >= 50) %>% select(english, science)
      # A tibble: 2 ร— 2
        english science
          <dbl>   <dbl>
      1      98      50
      2      97      60
      

3. ์ •๋ ฌ

🧃 데이터프레임 %>% arrange(정렬 기준 열)

df %>% arrange(math) ---------- ์˜ค๋ฆ„์ฐจ์ˆœ
df %>% arrange(desc(math)) ---- ๋‚ด๋ฆผ์ฐจ์ˆœ

	[์‚ฌ์šฉ์˜ˆ์‹œ]
    > # ๋ฐ˜์ด 2,3์ธ๋ฐ  ์ˆ˜ํ•™์ ์ˆ˜๊ฐ€ 50์ด์ƒ์ธ๋ฐ ํ•™์ƒ๋“ค์˜
    > # ์˜์–ด์™€ ๊ณผํ•™์ ์ˆ˜๋ฅผ
    > # ์˜์–ด๋Š” ์˜ค๋ฆ„์ฐจ์ˆœ, ๊ณผํ•™์€ ๋‚ด๋ฆผ์ฐจ์ˆœ์œผ๋กœ ์ •๋ ฌํ•ด ๋ณด์—ฌ์ฃผ์‹œ์˜ค
    > df %>% filter(class == c(2,3) & math >= 50 )%>% arrange(science, desc(math)) 
    # A tibble: 2 ร— 5
         id class  math english science
      <dbl> <dbl> <dbl>   <dbl>   <dbl>
    1     7     2    80      90      45
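    2    10     3    50      98      45

    ※ 주의: `class == c(2,3)` 은 벡터 재활용(recycling) 때문에 홀수 번째 행은 2, 짝수 번째 행은 3과만 비교한다.
       그래서 조건에 맞는 행이 누락된다 (위 결과에는 2반이면서 수학 50점 이상인 id 6, 8이 빠져 있다).
       여러 값 중 하나와 일치하는지 볼 때는 `%in%` 을 써야 한다.
       또한 위 코드는 설명과 달리 science 오름차순, math 내림차순으로 정렬하며 select도 빠져 있다.
       설명대로 하려면 다음과 같이 작성한다.
       df %>% filter(class %in% c(2, 3) & math >= 50) %>% select(english, science) %>% arrange(english, desc(science))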

์ƒˆ๋กœ์šด ์—ด(๋ณ€์ˆ˜) ์ถ”๊ฐ€

๐Ÿƒ mutate : ์žˆ๋Š” ๊ฐ’์„ ๋ณ€ํ˜•ํ•ด์„œ ์ถ”๊ฐ€ํ•˜๊ฒ ๋‹ค.
(์žˆ๋˜ ๊ฐ’ ๋ณ€ํ˜•์ด๋ผ ํŒŒ์ƒ๋ณ€์ˆ˜๋ผ๊ณ ๋„ ํ•จ)

๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ %>% mutate(์—ด์ด๋ฆ„ = ๋ณ€ํ˜• ๊ฐ’)
๋งŒ์•ฝ ์—ด ์ด๋ฆ„์„ ์ง€์ • ์•ˆํ•œ๋‹ค๋ฉด '๋ณ€ํ˜• ๊ฐ’'์ด ๊ทธ๋Œ€๋กœ ์—ด์ด๋ฆ„์ด ๋œ๋‹ค.

[์‚ฌ์šฉ์˜ˆ์‹œ]
> # 수학, 영어, 과학 점수를 합산해 새로운 열(tot)로 추가하기
> df %>% mutate(tot = math + english + science)
# A tibble: 20 ร— 6
      id class  math english science   tot
   <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>
 1     1     1    50      98      50   198
 2     2     1    60      97      60   217
 ...
 
># ์—ด์ด๋ฆ„ ์ง€์ • ์•ˆํ•ด์คฌ์„ ๋•Œ
> df %>% mutate(math + english + science)
# A tibble: 20 ร— 6
      id class  math english science `math + english + science`
   <dbl> <dbl> <dbl>   <dbl>   <dbl>                      <dbl>
 1     1     1    50      98      50                        198
 2     2     1    60      97      60                        217 
...

> # ๋‘ ๊ฐœ์˜ ์—ด์„ ๋™์‹œ์— ๋งŒ๋“ค์–ด ์ค„ ์ˆ˜๋„ ์žˆ์Œ
> df %>% mutate(tot = math + english + science,
+               mean = (math + english + science)/3)
# A tibble: 20 ร— 7
      id class  math english science   tot  mean
   <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl>
 1     1     1    50      98      50   198  66  
 2     2     1    60      97      60   217  72.3
...

># ๋จผ์ € ๊ณ„์‚ฐ๋œ ๊ฐ’์„ ๋’ค์— ์‚ฌ์šฉํ•  ์ˆ˜ ๋„ ์žˆ์Œ
> df %>% mutate(tot = math + english + science,
+               mean = tot/3)
# A tibble: 20 ร— 7
     id class  math english science   tot  mean
   <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl>
 1     1     1    50      98      50   198  66 
 ... (์˜ค๋ฅ˜์—†์ด ์‹คํ–‰๋˜๋Š” ๋ชจ์Šต)

+) ifelse

ifelse(์กฐ๊ฑด, ์ฐธ์ผ๋•Œ ์ถœ๋ ฅ์–ด, ๊ฑฐ์ง“์ผ๋•Œ ์ถœ๋ ฅ์–ด)

># ๋งŒ์•ฝ ์ˆ˜ํ•™์ด 50์ ์ด์ƒ์ด๋ผ๋ฉด pass, ์•„๋‹ˆ๋ผ๋ฉด fail ์—ด ์ถ”๊ฐ€
> df %>% mutate(test = ifelse(math >= 50, "Pass", "Fail"))
# A tibble: 20 ร— 6
      id class  math english science test 
   <dbl> <dbl> <dbl>   <dbl>   <dbl> <chr>
 1     1     1    50      98      50 Pass 
 2     2     1    60      97      60 Pass 
 3     3     1    45      86      78 Fail 
 4     4     1    30      98      58 Fail 
 5     5     2    25      80      65 Fail 

๐ŸŒฑ ์ง‘๋‹จ๋ณ„๋กœ ์š”์•ฝํ•˜๊ธฐ

์š”์•ฝ ํ†ต๊ณ„๋Ÿ‰ ํ•จ์ˆ˜

  • mean() : ํ‰๊ท 
  • sd() : ํ‘œ์ค€ํŽธ์ฐจ
  • sum() : ํ•ฉ๊ณ„
  • median() : ์ค‘์•™๊ฐ’
  • min() : ์ตœ์†Ÿ๊ฐ’
  • max() : ์ตœ๋Œ“๊ฐ’
  • n() : ๋นˆ๋„
># ์ˆ˜ํ•™์ ์ˆ˜ ํ‰๊ท  ๋‚ด๊ธฐ. ์•„๋ž˜ ๋‘˜์€ ๊ฐ™์€ ๊ฒฐ๊ณผ
mean(df$math)
df %>% summarise(mathM = mean(math))


># ์ƒˆ๋กœ์šด ์นผ๋Ÿผ, ํ‰๊ท  ์ฃผํ–‰์—ฐ๋น„ ์นผ๋Ÿผ์„ ๋งŒ๋“ค์–ด ์ฃผ์„ธ์š”
library(ggplot2)
mpg <- mpg %>% select(manufacturer, model, displ, year, cty, hwy) %>%
  mutate(ํ‰๊ท ์ฃผํ–‰์—ฐ๋น„ = (cty+hwy)/2) %>% arrange(desc(ํ‰๊ท ์ฃผํ–‰์—ฐ๋น„))

ํŠน์ • ์ง‘๋‹จ์œผ๋กœ ๋ฌถ์„ ๋•Œ

group_by
: ~๋ณ„ ํ‰๊ท ์„ ๊ตฌํ•˜์‹œ์˜ค ์™€ ๊ฐ™์€ ์กฐ๊ฑด์ผ๋•Œ ์‚ฌ์šฉ
: ํŠน์ • ์กฐ๊ฑด์„ ๋งŒ์กฑ์‹œํ‚ค๋Š” ๊ฐ’๋“ค์„ ๊ทธ๋ฃน์ง€์–ด ๋ฆฌํ„ดํ•˜๋Š” ๊ธฐ๋Šฅ์„ ๊ฐ€์ง„๋‹ค

># ์ œ์กฐ์‚ฌ๋ณ„ ๋„์‹œ ์ฃผํ–‰์—ฐ๋น„์˜ ํ‰๊ท ์€?
mpg %>% group_by(manufacturer) %>% summarise(์—ฐ๋น„ํ‰๊ท  = mean(cty))

># ์ค„์„ธ์šฐ๊ธฐ
mpg %>% group_by(manufacturer) %>% summarise(์—ฐ๋น„ํ‰๊ท  = mean(cty)) %>% arrange(์—ฐ๋น„ํ‰๊ท )

># ์ œ์กฐ์‚ฌ์™€ ๋ชจ๋ธ ๋ณ„ ๋„์‹œ์ฃผํ–‰์—ฐ๋น„์˜ ํ‰๊ท ์„ ์ค„์„ธ์›Œ์„œ
mpg %>% group_by(manufacturer, model) %>% 
  summarise(์—ฐ๋น„ํ‰๊ท  = mean(cty)) %>% arrange(์—ฐ๋น„ํ‰๊ท )

dataframe ํ•ฉ์น˜๊ธฐ

join

🍃 left_join(데이터프레임1, 데이터프레임2, by="기준 열") : 왼쪽 테이블의 모든 행을 유지하고, 기준열(by)이 일치하는 오른쪽 테이블의 값을 붙인다 (일치하는 값이 없으면 NA)

# ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
# ์ค‘๊ฐ„๊ณ ์‚ฌ
> test3 <- data.frame(id = c(1, 2, 3, 4, 5), 
                         midterm = c(60, 80, 70, 90, 85))

># ๊ธฐ๋ง๊ณ ์‚ฌ
> test4 <-  data.frame(id = c(6, 7, 8, 9, 10),
                     final = c(70, 83, 65, 95, 80))

> left_join(test3, test4, by = "id")
  id midterm final
1  1      60    NA
2  2      80    NA
3  3      70    NA
4  4      90    NA
5  5      85    NA

🍃 right_join(데이터프레임1, 데이터프레임2, by="기준 열") : 오른쪽 테이블의 모든 행을 유지하고, 기준열(by)이 일치하는 왼쪽 테이블의 값을 붙인다 (일치하는 값이 없으면 NA)

> right_join(test3, test4, by = 'id')
  id midterm final
1  6      NA    70
2  7      NA    83
3  8      NA    65
4  9      NA    95
5 10      NA    80

๐Ÿƒ bind_rows(dataframe1, dataframe2)

> all <-bind_rows(test3, test4)
> all
   id midterm final
1   1      60    NA
2   2      80    NA
3   3      70    NA
4   4      90    NA
5   5      85    NA
6   6      NA    70
7   7      NA    83
8   8      NA    65
9   9      NA    95
10 10      NA    80

๐Ÿƒ full_join(๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„1, ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„2, by="๊ธฐ์ค€ ์—ด") : ๋ชจ๋“  ํ…Œ์ด๋ธ” ๊ฐ’ ์ถ”์ถœ

># ์–ด๋А์—ด์„ ๊ธฐ์ค€์œผ๋กœ ํ• ์ง€ ์ •ํ•˜์ง€ ์•Š์œผ๋‹ˆ๊นŒ
># ์•Œ์•„์„œ "id"๋ฅผ ๊ธฐ์ค€์œผ๋กœ Joining ํ•ด์คŒ
> full_join(test3, test4)
Joining, by = "id"
   id midterm final
1   1      60    NA
2   2      80    NA
3   3      70    NA
4   4      90    NA
5   5      85    NA
6   6      NA    70
7   7      NA    83
8   8      NA    65
9   9      NA    95
10 10      NA    80

๐Ÿƒ inner_join(๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„1, ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„2, by="๊ธฐ์ค€ ์—ด") : ๊ธฐ์ค€์—ด์„ ์ค‘์‹ฌ์œผ๋กœ ๋‘ ํ…Œ์ด๋ธ”์— ๊ฐ™์ด ์กด์žฌํ•˜๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ์ถ”์ถœ

> inner_join(test3, test4)
Joining, by = "id"
[1] id      midterm final  
<0 ํ–‰> <๋˜๋Š” row.names์˜ ๊ธธ์ด๊ฐ€ 0์ž…๋‹ˆ๋‹ค>
># ๊ธฐ์ค€์—ด์„ ๋ช…์‹œํ•ด์ฃผ์ง€ ์•Š์ž ์•Œ์•„์„œ id๋ฅผ ๊ธฐ์ค€์—ด๋กœ ์žก์•˜์ง€๋งŒ
># ๊ฒน์น˜๋Š” ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์–ด ๊ฐ’์ด ๋‚˜์˜ค์ง€ ์•Š์Œ


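># test1, test2 : id(1~5)가 서로 겹치는 중간고사/기말고사 데이터
> test1 <- data.frame(id = c(1, 2, 3, 4, 5),
                      midterm = c(60, 80, 70, 90, 85))
> test2 <- data.frame(id = c(1, 2, 3, 4, 5),
                      final = c(70, 83, 65, 95, 80))

> inner_join(test1, test2)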
Joining, by = "id"
  id midterm final
1  1      60    70
2  2      80    83
3  3      70    65
4  4      90    95
5  5      85    80
># ๊ธฐ์ค€์—ด์ธ "id" ๊ฐ€ ๊ฒน์น˜๋Š” ๋ชจ๋“  ๊ฐ’๋“ค์„ ๋ณด์—ฌ์ค€๋‹ค.

열 이름 바꾸기 한번 더

  1. colnames(test1)=c('id',"test")
> test1
  id midterm
1  1      60
2  2      80
3  3      70
4  4      90
5  5      85

> colnames(test1)=c('id',"test") 
> test1
  id test
1  1   60
2  2   80
3  3   70
4  4   90
5  5   85

> colnames(test1)[2]= "test"
> test1
  id test
1  1   60
2  2   80
3  3   70
4  4   90
5  5   85
  2. rename(데이터프레임1, 새변수명 = 기존변수명)
    or
    ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„1 %>% rename(์ƒˆ๋ณ€์ˆ˜๋ช… = ๊ธฐ์กด๋ณ€์ˆ˜๋ช…)
> test1 %>% rename(midterm = test)
  id midterm
1  1      60
2  2      80
3  3      70
4  4      90
5  5      85

+) View()

๋ทฐ์–ด ์ฐฝ์—์„œ ๋ฐ์ดํ„ฐ ํ™•์ธํ•˜๊ธฐ

View(test2)


merge

  • merge(๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„1, ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„2, by = "๊ธฐ์ค€์ถ•")
    ๋‘ ๋ฐ์ดํ„ฐ set ๋น„๊ตํ•ด ๊ณตํ†ต๋˜๋Š” ๊ฒƒ๋งŒ ๋ฝ‘์•„์ค€๋‹ค.
> merge(test1, test2, by="id")
  id midterm final
1  1      60    70
2  2      80    83
3  3      70    65
4  4      90    95
5  5      85    80

> merge(test3, test4, by="id")
[1] id      midterm final  
<0 ํ–‰> <๋˜๋Š” row.names์˜ ๊ธธ์ด๊ฐ€ 0์ž…๋‹ˆ๋‹ค>



># ๋งŒ์•ฝ ๊ณตํ†ต๋˜์ง€ ์•Š๋Š” ๋ถ€๋ถ„๋„ ์‚ด๋ฆฌ๊ณ  ์‹ถ๋‹ค๋ฉด
># all.์‚ด๋ฆฌ๊ณ ์‹ถ์€ ์ถ• = TRUE
> merge(test3, test4, by="id", all.x = T)
  id midterm final
1  1      60    NA
2  2      80    NA
3  3      70    NA
4  4      90    NA
5  5      85    NA
> merge(test3, test4, by="id", all.y = T)
  id midterm final
1  6      NA    70
2  7      NA    83
3  8      NA    65
4  9      NA    95
5 10      NA    80



>#๋ชจ๋‘ ์‚ด๋ฆฌ๊ณ  ์‹ถ์œผ๋ฉด all = TRUE
> merge(test3, test4, by="id", all = T)
   id midterm final
1   1      60    NA
2   2      80    NA
3   3      70    NA
4   4      90    NA
5   5      85    NA
6   6      NA    70
7   7      NA    83
8   8      NA    65
9   9      NA    95
10 10      NA    80

0๊ฐœ์˜ ๋Œ“๊ธ€