💯 11/3 R 정리

김태준·2022년 12월 4일
0

R-Studio

목록 보기
3/5

ggplot2 패키지 특징

  • 데이터를 색, 형태, 크기 등으로 달리 표시하거나 범례(레전드)를 추가하는 일이 훨씬 더 용이
  • 그래프 만드는 속도 향상
  • 기본 그래프 함수 30줄 가량 >> ggplot2 1줄만에 작성 가능

ggplot2 ํŒจํ‚ค์ง€ ๊ตฌ์„ฑ

  • ggplot() : ์‚ฌ์šฉํ•  ๋ฐ์ดํ„ฐ๋ฅผ x์ถ•, y์ถ•, colour ๋“ฑ ๊ทธ๋ž˜ํ”„ ์š”์†Œ์— ๋งคํ•‘, aesํ•จ์ˆ˜ ์‚ฌ์šฉ
    ex) ggplot(diamonds, aes(x=x, y=price, colour = clarity))
  • geom() : ๋‹ค์–‘ํ•œ ๊ทธ๋ž˜ํ”„ ์ค‘ ์–ด๋–ค ๊ทธ๋ž˜ํ”„ ๊ทธ๋ฆด์ง€ ์„ ํƒ, ๊ธฐํ•˜๊ฐ์ฒด ํ•จ์ˆ˜๋ผ ๋ถ€๋ฆ„
    ex) ggplot() + geom_line(), geom_point(), geom_histogram() ๋“ฑ
  • theme() : ๋งŽ์€ ์–‘์˜ ๋””์ž์ธ ์š”์†Œ๋ฅผ ์ •ํ•  ์ˆ˜ ์žˆ๋Š” ํ•จ์ˆ˜

< 예제 코드 >

library(ggplot2)

# Create an empty plot: data only, no layers yet.
ggplot(data = diamonds)

# Histogram with carat on the x-axis (aesthetic mapped inside the layer).
ggplot(data = diamonds) +
  geom_histogram(aes(x = carat))

# Same as above, with the aesthetic mapped in ggplot() instead.
ggplot(data = diamonds, aes(x = carat)) +
  geom_histogram()

# Density plot of carat filled in grey.
ggplot(data = diamonds) +
  geom_density(aes(x = carat), fill = "grey50")

# Density plot of carat filled in red.
ggplot(data = diamonds) +
  geom_density(aes(x = carat), fill = "red")

# Scatter plot with carat on the x-axis and price on the y-axis.
ggplot(data = diamonds, aes(x = carat, y = price)) +
  geom_point()

# Same as above, with the aesthetics mapped inside the layer.
ggplot(data = diamonds) +
  geom_point(aes(x = carat, y = price))

< ggplot์˜ ๊ฒฝ์šฐ ๋ณ€์ˆ˜ ์„ค์ •์œผ๋กœ + ๋กœ ํ‘œํ˜„ ๊ฐ€๋Šฅ!
# Plot pieces can be stored in variables and combined later with `+`.

# Variant 1: position aesthetics on the base plot, colour on the point layer.
g1 <- ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price)
)
g2 <- geom_point(mapping = aes(color = color))

# Variant 2: bare base plot, every aesthetic declared on the point layer.
g3 <- ggplot(data = diamonds)
g4 <- geom_point(
  mapping = aes(x = carat, y = price, color = color)
)

# Draw both variants, in the same order as they were defined.
g1 + g2
g3 + g4

x์ถ•์ด carat์ด๊ณ  y์ถ•์ด color์ธ ๊ฒฝ์šฐ์˜ ํžˆ์Šคํ† ๊ทธ๋žจ ์ „๋ถ€ ์ถœ๋ ฅ(numerical๋งŒ)
ggplot(diamonds, aes(x = carat)) + geom_histogram() + facet_wrap(~color)
x = 1๋กœ ๊ณ ์ •, y = carat์ธ boxplot
ggplot(diamonds, aes(y = carat, x = 1)) + geom_boxplot()
x = cut, y = carat์ธ ๋ชจ๋“  boxplot
ggplot(diamonds, aes(y = carat, x = cut)) + geom_boxplot()
ggplot(diamonds, aes(y = carat, x = cut)) + geom_violin()
# ์ˆœ์„œ๋งŒ ๋ฐ”๊พธ์–ด ์ถœ๋ ฅํ•œ ํ˜•ํƒœ
ggplot(diamonds, aes(y = carat, x = cut)) + geom_point() + geom_violin()
ggplot(diamonds, aes(y = carat, x = cut)) + geom_violin() + geom_point()

ggplot2 꺾은선 그래프

  • 주로 연속하는 변수를 표시하는데 사용, 범주형 자료에도 사용가능!
# Install lubridate only when it is missing, rather than on every run.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate)
library(scales)

# Work on a plain data.frame copy of ggplot2's economics data set.
economics <- as.data.frame(ggplot2::economics)
economics

# Line chart of population (pop) against date.
ggplot(economics, aes(x = date, y = pop)) +
  geom_line()

# Derive the year and the month (as a label rather than a number) from date.
economics$year <- year(economics$date)
economics$month <- month(economics$date, label = TRUE)

# Keep only the observations from the year 2000 onwards.
econ2000 <- economics[economics$year >= 2000, ]
head(econ2000, 5)

# Population by month, one coloured line per year:
#   g1 base plot, g2 line layer grouped by year,
#   g3 legend title, g4 comma-formatted y-axis labels.
g1 <- ggplot(econ2000, aes(x = month, y = pop))
g2 <- geom_line(aes(color = factor(year), group = year))
g3 <- scale_color_discrete(name = "Year")
g4 <- scale_y_continuous(labels = comma)
g <- g1 + g2 + g3 + g4
g + labs(title = "Population Growth", x = "Month", y = "Population")

< 결과 >

실습 예제

mpg ๋ฐ์ดํ„ฐ๋ฅผ ๋ถˆ๋Ÿฌ์™€ ์‚ฌ๋ณธ์„ ๋งŒ๋“œ์„ธ์š”.
1. manufacturer, model, displ, drv, cty, hwy ์ถ”์ถœ
2. cty์˜ ๊ฐ’์„ ์ด์šฉํ•˜์—ฌ grade ๋ณ€์ˆ˜ ์ƒ์„ฑํ•˜์„ธ์š”.

  • 19๋ณด๋‹ค ๊ฐ™๊ฑฐ๋‚˜ ํฌ๋ฉด grade ๋ณ€์ˆ˜์— H
    19๋ณด๋‹ค ์ž‘๊ณ  14๋ณด๋‹ค ๊ฐ™๊ฑฐ๋‚˜ ํฌ๋ฉด M
    ๊ทธ ์™ธ๋Š” L
  1. grade๋ณ€์ˆ˜๋ฅผ mpg ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ ์‚ฌ๋ณธ์— ์ถ”๊ฐ€ํ•˜์„ธ์š”.
  2. grade๋ณ€์ˆ˜๊ฐ€ H, M, L๊ฐ€ ๊ฐ๊ฐ ๋ช‡๊ฐœ์”ฉ์ธ์ง€ ์นด์šดํŒ…ํ•˜์„ธ์š”.
  3. ํ•œ๋ฒˆ์— cty์™€ hwy์˜ ํžˆ์Šคํ† ๊ทธ๋žจ ์ฐจํŠธ๋ฅผ ๊ทธ๋ ค๋ณด์„ธ์š”.
  4. grade์˜ ๋ณ€์ˆ˜๊ฐ’๋ณ„๋กœ ๋ถ„ํฌ๋ฅผ ํ™•์ธ(boxplot()
# Exercise solution: work on a data.frame copy of ggplot2's mpg data set.
mpg <- as.data.frame(ggplot2::mpg)

# 1. Keep only the columns needed for the exercise.
mpg <- mpg[, c("manufacturer", "model", "displ", "drv", "cty", "hwy")]

# 2-3. Grade the city mileage and store it on the copy:
#      cty >= 19 -> "H", 14 <= cty < 19 -> "M", anything else -> "L".
mpg$grade <- ifelse(mpg$cty >= 19, "H", ifelse(mpg$cty >= 14, "M", "L"))

# 4. Count how many rows fall into each grade.
table(mpg$grade)

# 5. Draw the cty and hwy histograms side by side, then restore the
#    previous graphics layout so later plots use the full device again.
old_par <- par(mfrow = c(1, 2))
for (col_name in c("cty", "hwy")) {
  hist(mpg[[col_name]], main = col_name, xlab = col_name, col = "yellow")
}
par(old_par)

# 6. Distribution of cty within each grade.
boxplot(cty ~ grade, data = mpg)

๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ

์ด์ƒ์น˜ ํ™•์ธ ๋ฐ ์ œ๊ฑฐ

  • boxplot() ์ด์šฉ
# Start from a data.frame copy of ggplot2's mpg data so this snippet runs
# on its own.
df.mpg <- data.frame(ggplot2::mpg)

# hwy values that boxplot.stats() reports as outliers.
omit <- boxplot.stats(df.mpg$hwy)$out

# Replace those outlying hwy values with NA.
df.mpg$hwy[df.mpg$hwy %in% omit] <- NA

# Number of NA cells in each row, then the number of rows with any NA.
rowSums(is.na(df.mpg))
sum(rowSums(is.na(df.mpg)) > 0)

# Keep only the rows without NA and confirm that none remain.
no.outlier.df.mpg <- df.mpg[complete.cases(df.mpg), ]
sum(rowSums(is.na(no.outlier.df.mpg)) > 0)

๋ฐ์ดํ„ฐ ์ •๋ ฌ

๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ ์ •๋ ฌ

# Data.frame copy of ggplot2's mpg data set.
df.mpg <- data.frame(ggplot2::mpg)

# Row indices that put hwy in ascending order.
order(df.mpg$hwy)

# Rows sorted by hwy: ascending, then descending.
df.mpg[order(df.mpg$hwy), ]
df.mpg[order(df.mpg$hwy, decreasing = TRUE), ]

# Sort by hwy, breaking ties with displ; the single `decreasing = TRUE`
# applies to both keys.
df.mpg[order(df.mpg$hwy, df.mpg$displ, decreasing = TRUE), ]

๋ฐ์ดํ„ฐ ์ง‘๊ณ„

  • aggregate()ํ•จ์ˆ˜
    : 2์ฐจ์› ๋ฐ์ดํ„ฐ, ๋ฐ์ดํ„ฐ ๊ทธ๋ฃน์— ๋Œ€ํ•ด ํ‰๊ท , ํ•ฉ์„ ๊ตฌํ•˜๋Š” ์ž‘์—…
# Read the student list from the working directory.
student.list <- read.csv("example_studentlist.csv")

# Mean weight per blood type via the formula interface. Bare column names
# are looked up in `data`, so the result columns are named bloodtype/weight.
aggregate(weight ~ bloodtype, data = student.list, FUN = mean)

# The by= interface; naming the list element labels the group column
# `bloodtype` instead of the default `Group.1`.
aggregate(
  student.list$weight,
  by = list(bloodtype = student.list$bloodtype),
  FUN = mean
)

๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ

# Two score tables that share the key column `name`. The science scores get
# their own `sci` column so merged results read name/math/sci rather than
# math.x/math.y.
math <- data.frame(name = c("a", "b", "c"), math = c(70, 80, 90))
sci <- data.frame(name = c("a", "b", "d"), sci = c(10, 20, 30))
math
sci

# Inner join: only the names present in both tables.
merge(math, sci, by = "name")

# Left join: every row of `math` is kept.
merge(math, sci, by = "name", all.x = TRUE)

# Right join: every row of `sci` is kept.
merge(math, sci, by = "name", all.y = TRUE)

# Full outer join: every row of both tables is kept.
merge(math, sci, by = "name", all = TRUE)
profile
To be a DataScientist

0๊ฐœ์˜ ๋Œ“๊ธ€