💯 11/27 R 정리

김태준·2022년 12월 4일
1

R-Studio

목록 보기
5/5

๋ฐ์ดํ„ฐ ๋ถ„์„ ํ”„๋กœ์ ํŠธ

한국보건사회연구원에서 가구의 경제활동을 연구해 정책 지원을 위해 발간하는 조사 자료(Koweps_hpc10_2015_beta1.sav : SPSS 전용파일)

준비 단계

  • ๋ฐ์ดํ„ฐ ์ค€๋น„, ํŒจํ‚ค์ง€ ์„ค์น˜ ๋ฐ ๋กœ๋“œ, ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ
library(dplyr)
library(ggplot2)
library(ggthemes)
library(haven)
library(readxl)
library(reshape)
# Read the Koweps 2015 SPSS file into `df`, decoding text as UTF-8.
df <- 'Koweps_hpc10_2015_beta1.sav' %>%
  read_sav(encoding = 'UTF-8')

# Show the structure of the loaded data for a first inspection.
str(df)
  • ๋ฐ์ดํ„ฐ ๋ณ€์ˆ˜๋ช… ๋ณ€๊ฒฝ
# Replace the survey's coded column names with readable ones used by the
# rest of the analysis.
#
# The call is namespace-qualified on purpose: library(reshape) is attached
# after library(dplyr), so a bare rename() resolves to reshape::rename() and
# silently depends on that load order. dplyr::rename() takes
# new_name = old_name pairs.
#
# NOTE(review): h10_reg7 looks like a region code rather than a religion
# code, so 'code_religion' may be a misnomer. The name is kept unchanged so
# existing references keep working; confirm against the codebook.
df <- dplyr::rename(
  df,
  sex = h10_g3,
  birth = h10_g4,
  marriage = h10_g10,
  religion = h10_g11,
  income = p1002_8aq1,
  code_job = h10_eco9,
  code_religion = h10_reg7
)
  • ๋‚จ๋…€ ์„ฑ๋น„์œจ ๊ฒ€ํ†  sex๊ฐ€ 1์ด๋ฉด male, 0์ด๋ฉด female
# Recode sex into labels: the value 1 becomes 'male', any other non-missing
# value becomes 'female' (missing values stay NA).
df <- df %>%
  mutate(sex = ifelse(sex == 1, 'male', 'female'))

# Frequency of each label.
table(df$sex)

# Check whether any missing values exist.
table(is.na(df$sex))

변수 검토 및 전처리

  • ์›”๊ธ‰
    ๊ธ‰์—ฌ๋ฅผ 0์—์„œ 10000๊นŒ์ง€๋งŒ ํ‘œ์‹œ
    ๊ธ‰์—ฌ๊ฐ€ 0๋˜๋Š” 9999๋ฉด NA๋กœ ๋ณ€๊ฒฝ
# Inspect the income variable before cleaning it.
class(df$income)
summary(df$income)

# Histogram of income over its full range.
qplot(df$income)

# Same histogram restricted to the 0-1000 range. xlim() is a plot layer, so
# it must be added to the qplot() result, not to the data vector inside the
# call.
qplot(df$income) + xlim(0, 1000)

# Count missing values before recoding.
table(is.na(df$income))

# Treat the codes 0 and 9999 as missing. %in% never yields NA, so rows that
# are already NA fall through to the `no` branch and stay NA.
df$income <- ifelse(df$income %in% c(0, 9999), NA, df$income)

# Count missing values after recoding.
table(is.na(df$income))
  • ๋ณ€์ˆ˜๊ฐ„์˜ ๊ด€๊ณ„
    ์„ฑ๋ณ„์— ์›”๊ธ‰ ํ‰๊ท  ๋งŒ๋“ค๊ธฐ
    ๊ทธ๋ž˜ํ”„ ์ž‘์„ฑ
# Mean income per sex, computed only over rows with a known income.
sex_income <- df %>%
  filter(!is.na(income)) %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income))

# Bar chart of mean income by sex. The ggplot() call is closed before
# geom_col() is added; without that closing parenthesis the line does not
# parse.
ggplot(data = sex_income, aes(x = sex, y = mean_income)) + geom_col()
  • ๋‚˜์ด์— ๋”ฐ๋ฅธ ์›”๊ธ‰์˜ ์ฐจ์ด
    ์ฒ˜๋ฆฌ ๊ทœ์ •
    birth์˜ ๋…„๋„๋Š” 1900~2014 ์‚ฌ์ด์˜ ๊ฐ’
    birth๊ฐ€ 9999 ๊ฐ’์ด๋ฉด ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ
    ํŒŒ์ƒ ๋ณ€์ˆ˜(๋‚˜์ด, age) ์ƒ์„ฑ
# Inspect the birth-year variable.
class(df$birth)

# 9999 is treated as a missing birth year.
df$birth <- ifelse(df$birth == 9999, NA, df$birth)

# Derive age relative to the survey wave the income figures come from (the
# data file is the 2015 release), not relative to the year the script is run.
# Using a later calendar year would shift every respondent's age away from
# the age at which the income was reported.
survey_year <- 2015
df <- df %>% mutate(age = survey_year - birth)

# Count rows where age could not be derived.
table(is.na(df$age))

# Mean income per age, over rows with a known income.
age_income <- df %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))

# Line chart of mean income by age.
ggplot(data = age_income, aes(x = age, y = mean_income)) + geom_line()
  • 연령대에 따른 월급 차이
# Bucket age into three groups: under 30 'young', 30-59 'middle', 60 and
# over 'old'. Every branch has an explicit condition, so rows with a missing
# age match none of them and stay NA.
df <- df %>%
  mutate(
    age_group = case_when(
      age < 30 ~ 'young',
      age < 60 ~ 'middle',
      age >= 60 ~ 'old'
    )
  )

# Inspect the new grouping: size of each group, then the missing count.
table(df$age_group)
table(is.na(df$age_group))

# Mean income per age group, over rows with a known income.
age_group_income <- df %>%
  filter(!is.na(income)) %>%
  group_by(age_group) %>%
  summarise(mean_income = mean(income))

# Bar chart with the default ordering of the x axis.
ggplot(data = age_group_income, aes(x = age_group, y = mean_income)) +
  geom_col()

# Same chart with the groups in a fixed young -> middle -> old order.
ggplot(data = age_group_income, aes(x = age_group, y = mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c('young', 'middle', 'old'))
  • ์ง์—…๋ณ„ ๊ธ‰์—ฌ ํ‰๊ท  ๊ณ„์‚ฐ
    job์™€ income์ด NA๊ฐ€ ์•„๋‹Œ ํ–‰์„ ํ•„ํ„ฐ๋ง
    ์ง์—…๋ณ„๋กœ ์ˆ˜์ž…์˜ ํ‰๊ท ์ด ๋†’์€ 10๊ฐœ์˜ ์ง์—…์„ ์„ ํƒํ•˜๊ณ , ๊ทธ๋ž˜ํ”„๋ฅผ ์™„์„ฑ
    ์ง์—…๋ณ„๋กœ ์ˆ˜์ž…์˜ ํ‰๊ท ์ด ๋‚ฎ์€ 10๊ฐœ์˜ ์ง์—…์„ ์„ ํƒํ•˜๊ณ , ๊ทธ๋ž˜ํ”„๋ฅผ ์™„์„ฑ
# Read the job lookup table from the second sheet of the codebook; the first
# row holds the column names.
occupation <- read_excel('Koweps_Codebook.xlsx', col_names = TRUE, sheet = 2)
head(occupation)
dim(occupation)

# Print the job-code column of the survey data.
df$code_job

# Attach the lookup columns to each survey row. The join key is passed via
# `by`; left_join() has no `id` argument, so `id = 'code_job'` never selected
# the key explicitly.
df <- left_join(df, occupation, by = 'code_job')

# Spot-check the join on rows that have a job code.
df %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head(10)

# Mean income per job, using only rows where both job and income are known.
job_income <- df %>%
  filter(!is.na(job) & !is.na(income)) %>%
  group_by(job) %>%
  summarise(mean_income = mean(income))

head(job_income)

# Overview bar chart of mean income for every job.
ggplot(data = job_income, aes(x = job, y = mean_income)) + geom_col()

# The ten jobs with the highest and the ten with the lowest mean income.
job_income_desc <- job_income %>% arrange(desc(mean_income)) %>% head(10)
job_income_asc <- job_income %>% arrange(mean_income) %>% head(10)

# No bare `?` may precede these calls: `?` is a prefix operator that
# continues onto the next line, which would turn the ggplot() call into a
# help lookup instead of drawing the chart.
# Horizontal bars, ordered by mean income.
ggplot(data = job_income_desc, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()

# Same layout for the lowest-paid jobs, with the value axis fixed to 0-580.
ggplot(data = job_income_asc, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip() +
  ylim(0, 580)
  • 성별 직업의 빈도
    남자와 여자가 많이 갖는 직업의 종류가 무엇인지 조사하세요.
    각각 상위 10개의 직업을 찾고, 각 직업의 종사자 수를 구하고 차트를 완성하세요.
# Ten most common jobs among rows labelled 'male', with the number of rows
# per job in column `n`.
job_male <- df %>%
  filter(!is.na(job), sex == 'male') %>%
  dplyr::count(job) %>%
  arrange(desc(n)) %>%
  head(10)
job_male

# Same ranking for rows labelled 'female'.
job_female <- df %>%
  filter(!is.na(job), sex == 'female') %>%
  dplyr::count(job) %>%
  arrange(desc(n)) %>%
  head(10)
job_female

coord_flip() 함수로 막대그래프 y축 기준으로 생성!
# Horizontal bar chart of the top jobs for men, bars ordered by count.
job_male %>%
  ggplot(aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

# Same chart for women.
job_female %>%
  ggplot(aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()

R 공부 후기

물론, 학교 시험을 위해 공부를 했지만 기본적인 데이터 전처리, 시각화 과정 등에 대해 학습할 수 있었고 1줄 단위로 실행한다는 점에서 파이썬에 비해 불편했던 점이 많았다.
그래도 덕분에 그동안 잘은 몰랐던 R 함수들을 직접 다뤄보는 계기가 되었다.
파이썬 공부를 더 열심히 하자는 의욕이 생기도록 만들어준 언어..

profile
To be a DataScientist

0๊ฐœ์˜ ๋Œ“๊ธ€