💯 11/17 R 정리

김태준·2022년 12월 4일
1

R-Studio

목록 보기
4/5
post-thumbnail

히스토그램

library(ggplot2)
library(dplyr)

# Load the student records used by every histogram in this section.
df <- read.csv("student_info.csv")
# Check the shape of the data frame and the type of each column.
str(df)

# Weight histograms with a bin width of 1 and of 5.
ggplot(df, aes(x = weight)) + geom_histogram(binwidth = 1)
ggplot(df, aes(x = weight)) + geom_histogram(binwidth = 5)

# Same histogram with outlined, gray-filled bars.
ggplot(df, aes(x = weight)) +
  geom_histogram(binwidth = 1, color = "black", fill = "gray")

# Add a dashed red line at the mean weight. xintercept is set outside aes(),
# so the column has to be referenced through the data frame (df$weight);
# a bare `weight` would be looked up as a global variable.
ggplot(df, aes(x = weight)) +
  geom_histogram(binwidth = 1, color = "black", fill = "gray") +
  geom_vline(xintercept = mean(df$weight), color = "red",
             linetype = "dashed", size = 1)

# Split the output by blood type (bt): stacked bars first, then side by side.
ggplot(df, aes(x = weight, fill = bt)) + geom_histogram(binwidth = 5)
ggplot(df, aes(x = weight, fill = bt)) +
  geom_histogram(binwidth = 5, position = "dodge") +
  theme(legend.position = "top")

산점도

# Scatter plots of student weight (x) against height (y).
df.student.info <- read.csv("student_info.csv")
str(df.student.info)

ggplot(df.student.info, aes(x = weight, y = height)) + geom_point()

# Larger points colored by blood type (bt). The title literal was stored as
# mis-encoded bytes; it is restored to the decoded Korean text.
ggplot(df.student.info, aes(x = weight, y = height, color = bt)) +
  geom_point(size = 4) +
  ggtitle("몸무게와 체중")

boxplot

# Box plots of student weight: one overall, one split by blood type (bt).
df.student.info <- read.csv("student_info.csv")
str(df.student.info)

ggplot(data = df.student.info, mapping = aes(y = weight)) +
  geom_boxplot(fill = "steelblue")

ggplot(data = df.student.info, mapping = aes(y = weight, fill = bt)) +
  geom_boxplot()

선그리기

library(ggplot2)

# Open the documentation page of the built-in Nile series.
help(Nile)

# Pair each Nile observation with a year from 1871 to 1970.
year <- 1871:1970
flow.river <- data.frame(Nile)
df.flow.river <- data.frame(year, Nile)
str(df.flow.river)

# Draw the series as a red line.
ggplot(data = df.flow.river, mapping = aes(x = year, y = Nile)) +
  geom_line(col = "red")

diamonds 예제

library(ggplot2)

# A plot object with data only and no layers.
ggplot(data = diamonds)

# Histogram of carat: the mapping can live in the layer or in ggplot().
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat))
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram()

# Density of carat with two different fill colors.
ggplot(data = diamonds) +
  geom_density(mapping = aes(x = carat), fill = "grey50")
ggplot(data = diamonds) +
  geom_density(mapping = aes(x = carat), fill = "red")

# Price against carat, again with the mapping in either place.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point()
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

# Base plot and layer stored separately, then combined with `+`.
g1 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))
g2 <- geom_point(mapping = aes(color = color))
g1 + g2

# Same idea with the whole mapping held by the layer.
g3 <- ggplot(data = diamonds)
g4 <- geom_point(mapping = aes(x = carat, y = price, color = color))
g3 + g4

실습

전국인구조사 자료(example_population_f.csv)를 이용하여 다양한 데이터 분석을 진행하세요.

library(ggplot2)
library(dplyr)
library(ggthemes)

# Read the census table; the file is decoded as CP949 while reading.
df <- read.csv("example_population_f.csv", header = TRUE,
               fileEncoding = "cp949", encoding = "UTF-8")
df
str(df)

# Drop the first column.
df <- df[, -1]
df

# Keep the rows whose Provinces is Chungcheongbuk-do or Chungcheongnam-do.
# The literals were stored as mis-encoded bytes and could never match the
# decoded data, so they are restored to the real province names.
df2 <- filter(df, Provinces %in% c("충청북도", "충청남도"))

# Bar chart with City on the x axis and Population on the y axis
# (built from the full table and stored, not displayed).
graph <- ggplot(df, aes(x = City, y = Population, fill = Provinces)) +
  geom_bar(stat = "identity") +
  theme_wsj()

# Sort the bars in ascending order of population for readability.
graph_order <- ggplot(
  df2,
  aes(x = reorder(City, Population), y = Population, fill = Provinces)
) +
  geom_bar(stat = "identity") +
  theme_wsj()
graph_order

# Cities with SexRatio above 1 and PersInHou below 2.
df3 <- filter(df, SexRatio > 1, PersInHou < 2)
# Inspect the filtered rows (the original printed the unfiltered df here).
df3
graph2 <- ggplot(df3, aes(x = City, y = SexRatio, fill = Provinces)) +
  geom_bar(stat = "identity") +
  theme_wsj()
graph2

# Reload the census table and drop its first column.
df <- read.csv("example_population_f.csv", header = TRUE,
               fileEncoding = "cp949", encoding = "UTF-8")
df <- df[, -1]

# Label every row by its SexRatio: below 1 -> "more women", above 1 ->
# "more men", otherwise "equal". The label literals were stored as
# mis-encoded bytes and are restored to the decoded Korean text.
df <- mutate(
  df,
  SexF = ifelse(
    SexRatio < 1, "여자비율이 높음",
    ifelse(SexRatio > 1, "남자비율이 높음", "남녀비율이 같음")
  )
)
df$SexF <- factor(df$SexF)

# Rows for Gyeonggi-do only.
df2 <- filter(df, Provinces == "경기도")

# Bars show the deviation of SexRatio from 1, so they point up or down
# from zero. The column mappings must be wrapped in aes(); passed directly
# to ggplot() they are not treated as aesthetics.
graph <- ggplot(df2, aes(x = City, y = SexRatio - 1, fill = SexF)) +
  geom_bar(stat = "identity", position = "identity") +
  theme_wsj()
graph

< 경기도 성비 >

# Rows for Seoul only (literal restored from mis-encoded bytes).
df4 <- filter(df, Provinces == "서울특별시")

# Deviation of SexRatio from 1 per city. `theme_whj` was a typo for
# ggthemes::theme_wsj(), which also has to be called as a function.
graph2 <- ggplot(df4, aes(x = City, y = SexRatio - 1, fill = SexF)) +
  geom_bar(stat = "identity", position = "identity") +
  theme_wsj()
graph2

< 서울특별시 성비 >

실습

mpg ๋ฐ์ดํ„ฐ๋ฅผ ์ด์šฉํ•˜์—ฌ ๋‹ค์Œ ์ฐจํŠธ๋ฅผ ๋งŒ๋“ค์–ด ๋ณด์„ธ์š”.
1. mpg๋ฐ์ดํ„ฐ์˜ cty์™€ hwy ๊ฐ„์— ์–ด๋–ค ๊ด€๊ณ„๊ฐ€ ์žˆ๋Š”์ง€ ์•Œ์•„๋ณด๋ ค๊ณ  ํ•ฉ๋‹ˆ๋‹ค. x์ถ•์€ cty, y์ถ•์€
hwy๋กœ ๋œ ์‚ฐ์ ๋„๋ฅผ ๋งŒ๋“ค์–ด ๋ณด์„ธ์š”.

  • midwest ๋ฐ์ดํ„ฐ๋ฅผ ์ด์šฉํ•˜์—ฌ ๋‹ค์Œ์„ ๋ถ„์„ํ•˜์„ธ์š”
  1. midwest ๋ฐ์ดํ„ฐ๋ฅผ ์ด์šฉํ•ด ์ „์ฒด ์ธ๊ตฌ์™€ ์•„์‹œ์•„์ธ ์ธ๊ตฌ ๊ฐ„์— ์–ด๋–ค ๊ด€๊ณ„๊ฐ€ ์žˆ๋Š”์ง€ ์•Œ์•„๋ณด
    ๋ ค๊ณ  ํ•ฉ๋‹ˆ๋‹ค. x์ถ•์€ poptotal, y์ถ•์€ popasian์œผ๋กœ ์‚ฐ์ ๋„๋ฅผ ๋งŒ๋“ค์–ด ๋ณด์„ธ์š”.
  2. ๊ทธ๋ฆฌ๊ณ , ์ „์ฒด ์ธ๊ตฌ๋Š” 50๋งŒ ์ดํ•˜, ์•„์‹œ์•„์ธ ์ธ๊ตฌ๋Š” 1๋งŒ๋ช… ์ดํ•˜์ธ ์ง€์—ญ๋งŒ ์‚ฐ์ ๋„์— ํ‘œ์‹œ๋˜
    ๊ฒŒ ํ•˜์„ธ์š”.
  • ์ฐธ๊ณ  : ์ง€์ˆ˜ ํ‘œ์‹œ๋ฅผ ์ž์—ฐ์ˆ˜๋กœ ํ•˜๋ ค๋ฉด options(scipen = 99) vs options(scipen = 0)
# 1. Scatter plot of city mileage (cty) against highway mileage (hwy).
mpg <- as.data.frame(ggplot2::mpg)
ggplot(data = mpg, aes(x = cty, y = hwy)) + geom_point()

# 2. Scatter plot of total population against Asian population.
midwest <- as.data.frame(ggplot2::midwest)
ggplot(data = midwest, aes(x = poptotal, y = popasian)) + geom_point()

# 3. Same plot restricted to poptotal <= 500000 and popasian <= 10000.
# scipen = 99 makes R print plain integers instead of exponent notation
# (e.g. 2e+06); it is set before the plot so the axis labels use it.
options(scipen = 99)
ggplot(data = midwest, aes(x = poptotal, y = popasian)) +
  geom_point() +
  xlim(0, 500000) +
  ylim(0, 10000)
# Back to the default, which allows exponent notation again.
options(scipen = 0)

1.mpg ๋ฐ์ดํ„ฐ๋ฅผ ์ด์šฉํ•ด์„œ drv๋ณ„ ํ‰๊ท  hwy๋ฅผ ๋ง‰๋Œ€๊ทธ๋ž˜ํ”„๋กœ ํ‘œํ˜„
2. ๋ง‰๋Œ€ ๊ทธ๋ž˜ํ”„์˜ x์ถ•์€ ๊ธฐ๋ณธ์ ์œผ๋กœ ์•ŒํŒŒ๋ฒณ ์ˆœ์œผ๋กœ ์ •๋ ฌ
3. reorder()๋ฅผ ์ด์šฉํ•˜์—ฌ ํฌ๊ธฐ์ˆœ์œผ๋กœ ์ •๋ ฌ ๊ฐ€๋Šฅ

mpg <- as.data.frame(ggplot2::mpg)

# 1. Mean highway mileage (hwy) for each drive type (drv).
mpg_plot <- mpg %>%
  group_by(drv) %>%
  summarise(mean_hwy = mean(hwy))

# 3. Bar chart with the bars ordered by mean_hwy via reorder(). The plot
# reads the summary built above (the original referred to an undefined
# `df_mpg`).
ggplot(data = mpg_plot, aes(x = reorder(drv, mean_hwy), y = mean_hwy)) +
  geom_col()
코드를 입력하세요

mpg ๋ฐ์ดํ„ฐ๋ฅผ ์ด์šฉํ•ด์„œ ๋ถ„์„ํ•˜์„ธ์š”
1. ์–ด๋–ค ํšŒ์‚ฌ์—์„œ ์ƒ์‚ฐํ•˜๋Š” โ€œsuvโ€ ์ฐจ์ข…์˜ ๋„์‹œ ์—ฐ๋น„๊ฐ€ ๋†’์€์ง€ ์•Œ์•„๋ณด๋ ค๊ณ  ํ•ฉ๋‹ˆ๋‹ค.
โ€œsuvโ€ ์ฐจ์ข…์„ ๋Œ€์ƒ์œผ๋กœ ํ‰๊ท  cty๊ฐ€ ๊ฐ€์žฅ ๋†’์€ ํšŒ์‚ฌ ๋‹ค์„ฏ ๊ณณ์„ ๋ง‰๋Œ€ ๊ทธ๋ž˜ํ”„๋กœ ํ‘œํ˜„
ํ•˜์„ธ์š”. ๋ง‰๋Œ€๋Š” ์—ฐ๋น„๊ฐ€ ๋†’์€ ์ˆœ์œผ๋กœ ์ •๋ ฌํ•˜์„ธ์š”.
2. ์ž๋™์ฐจ ์ค‘์—์„œ ์–ด๋–ค class๊ฐ€ ๊ฐ€์žฅ ๋งŽ์€์ง€ ์•Œ์•„๋ณด๋ ค๊ณ  ํ•ฉ๋‹ˆ๋‹ค. ์ž๋™์ฐจ ์ข…๋ฅ˜๋ณ„ ๋นˆ๋„
๋ฅผ ํ‘œํ˜„ํ•œ ๋ง‰๋Œ€ ๊ทธ๋ž˜ํ”„๋ฅผ ๋งŒ๋“ค์–ด ๋ณด์„ธ์š”.

# 1. Five manufacturers with the highest mean city mileage among SUVs.
# filter() needs the comparison operator `==` and a closed string literal.
mpg2 <- mpg %>%
  filter(class == "suv") %>%
  group_by(manufacturer) %>%
  summarise(mean_cty = mean(cty)) %>%
  arrange(desc(mean_cty)) %>%
  head(5)

# Negating the sort key puts the highest mileage first, as the exercise
# asks ("sort the bars from highest mileage").
ggplot(data = mpg2, aes(x = reorder(manufacturer, -mean_cty), y = mean_cty)) +
  geom_col()

# 2. Number of cars in each class; geom_bar() counts the rows itself.
ggplot(data = mpg, aes(x = class)) + geom_bar()

mpg ๋ฐ์ดํ„ฐ๋ฅผ ์ด์šฉํ•ด์„œ ๋ถ„์„ํ•˜์„ธ์š”
1. class๊ฐ€ โ€œcompactโ€, โ€œsubcompactโ€, โ€œsuvโ€์ธ ์ž๋™์ฐจ์˜ cty๊ฐ€ ์–ด๋–ป๊ฒŒ ๋‹ค๋ฅธ์ง€ ๋น„๊ตํ•ด ๋ณด๋ ค๊ณ 
ํ•ฉ๋‹ˆ๋‹ค. ์ƒ์ž ๊ทธ๋ž˜ํ”„๋กœ ๋งŒ๋“ค์–ด ๋ณด์„ธ์š”.

# Keep the three classes to compare, then draw one cty box per class.
# Both statements were missing a closing parenthesis.
mpg_plot <- mpg %>%
  filter(class %in% c("compact", "subcompact", "suv"))
ggplot(data = mpg_plot, aes(x = class, y = cty)) + geom_boxplot()

트리맵

# Install treemap only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("treemap", quietly = TRUE)) {
  install.packages("treemap")
}
library(treemap)

# Minimal example: one rectangle per name, sized by value.
tree_data <- data.frame(
  name = c("KIM", "LEE", "CHOI", "HAN"),
  value = c(200, 300, 50, 600)
)
tree_data
treemap(tree_data, index = "name", vSize = "value", type = "index")

# mpg example: keep only the columns the treemap needs.
tree_mpg <- as.data.frame(ggplot2::mpg)
tree_mpg <- tree_mpg[, c("manufacturer", "model", "hwy")]
tree_mpg

# FUN is the function aggregate() applies to hwy within each
# manufacturer/model group; here it yields the mean hwy per model.
tree_mpg <- aggregate(hwy ~ manufacturer + model, data = tree_mpg, FUN = mean)
tree_mpg

# Two-level treemap: manufacturers split into their models.
treemap(tree_mpg, index = c("manufacturer", "model"), vSize = "hwy",
        type = "index")
# Same treemap with the Dark2 palette (argument is `palette`; the original
# misspelt it as `palatte`).
treemap(tree_mpg, index = c("manufacturer", "model"), vSize = "hwy",
        type = "index", palette = "Dark2")

profile
To be a DataScientist

0개의 댓글