< 예제 코드 >
library(ggplot2)

# Empty plot: the data is bound but no geom layer has been added.
ggplot(data = diamonds)

# Histogram of carat, with the x aesthetic mapped inside the geom.
ggplot(data = diamonds) +
  geom_histogram(aes(x = carat))

# Same histogram, with the x aesthetic mapped in ggplot() instead.
ggplot(data = diamonds, aes(x = carat)) +
  geom_histogram()

# Density curve of carat with a grey fill.
ggplot(data = diamonds) +
  geom_density(aes(x = carat), fill = "grey50")

# Density curve of carat with a red fill.
ggplot(data = diamonds) +
  geom_density(aes(x = carat), fill = "red")

# Scatter plot with carat on the x axis and price on the y axis.
ggplot(data = diamonds, aes(x = carat, y = price)) +
  geom_point()

# Same scatter plot, with the mapping declared in the geom.
ggplot(data = diamonds) +
  geom_point(aes(x = carat, y = price))
< ggplot์ ๊ฒฝ์ฐ ๋ณ์ ์ค์ ์ผ๋ก + ๋ก ํํ ๊ฐ๋ฅ!
# Plot pieces can be stored in variables and combined later with `+`.

# Variant 1: x/y are mapped on the base plot, color on the point layer.
g1 <- ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price)
)
g2 <- geom_point(mapping = aes(color = color))
g1 + g2

# Variant 2: the base plot only carries the data; the point layer holds
# the complete mapping.
g3 <- ggplot(data = diamonds)
g4 <- geom_point(
  mapping = aes(x = carat, y = price, color = color)
)
g3 + g4
# Histogram of carat, drawn as one panel per level of color.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram() +
  facet_wrap(~color)

# Single box plot of carat; x is pinned to the constant 1.
ggplot(diamonds, aes(y = carat, x = 1)) +
  geom_boxplot()

# One box plot of carat for each cut.
ggplot(diamonds, aes(y = carat, x = cut)) +
  geom_boxplot()

# Same grouping drawn as violins.
ggplot(diamonds, aes(y = carat, x = cut)) +
  geom_violin()

# The next two plots differ only in the order the layers are added:
# points first and then violins ...
ggplot(diamonds, aes(y = carat, x = cut)) +
  geom_point() +
  geom_violin()

# ... versus violins first and then points.
ggplot(diamonds, aes(y = carat, x = cut)) +
  geom_violin() +
  geom_point()
# Work on a plain data.frame copy of the ggplot2 economics data set.
economics <- as.data.frame(ggplot2::economics)
economics

# Line chart of pop against date.
ggplot(economics, aes(x = date, y = pop)) +
  geom_line()

# Install lubridate only when it is missing, so that re-running the script
# does not reinstall the package every time.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate)

# Derive year and (labelled) month columns from the date column.
economics$year <- year(economics$date)
economics$month <- month(economics$date, label = TRUE)

# Keep the rows whose year is 2000 or later and preview the first five.
econ2000 <- economics[which(economics$year >= 2000), ]
head(econ2000, 5)

# scales supplies comma(), used below to format the y-axis labels.
library(scales)

# Build the plot from separately stored pieces: pop by month, one line per
# year, coloured by year.
g1 <- ggplot(econ2000, aes(x = month, y = pop))
g2 <- geom_line(aes(color = factor(year), group = year))
g3 <- scale_color_discrete(name = "Year")
g4 <- scale_y_continuous(labels = comma)
g <- g1 + g2 + g3 + g4
g + labs(title = "Population Growth", x = "Month", y = "Population")
< ๊ฒฐ๊ณผ >
mpg ๋ฐ์ดํฐ๋ฅผ ๋ถ๋ฌ์ ์ฌ๋ณธ์ ๋ง๋์ธ์.
1. manufacturer, model, displ, drv, cty, hwy ์ถ์ถ
2. cty์ ๊ฐ์ ์ด์ฉํ์ฌ grade ๋ณ์ ์์ฑํ์ธ์.
- 19๋ณด๋ค ๊ฐ๊ฑฐ๋ ํฌ๋ฉด grade ๋ณ์์ H
19๋ณด๋ค ์๊ณ 14๋ณด๋ค ๊ฐ๊ฑฐ๋ ํฌ๋ฉด M
๊ทธ ์ธ๋ L
- grade๋ณ์๋ฅผ mpg ๋ฐ์ดํฐ ํ๋ ์ ์ฌ๋ณธ์ ์ถ๊ฐํ์ธ์.
- grade๋ณ์๊ฐ H, M, L๊ฐ ๊ฐ๊ฐ ๋ช๊ฐ์ฉ์ธ์ง ์นด์ดํ ํ์ธ์.
- ํ๋ฒ์ cty์ hwy์ ํ์คํ ๊ทธ๋จ ์ฐจํธ๋ฅผ ๊ทธ๋ ค๋ณด์ธ์.
- grade์ ๋ณ์๊ฐ๋ณ๋ก ๋ถํฌ๋ฅผ ํ์ธ(boxplot()
# Exercise solution: grade the cars in mpg by their cty value.
mpg <- as.data.frame(ggplot2::mpg)

# 1. Keep only the six columns the exercise asks for (column selection, not
#    row filtering).
mpg <- mpg[, c("manufacturer", "model", "displ", "drv", "cty", "hwy")]

# 2-3. Derive grade from cty and store it on the data frame:
#      cty >= 19 -> "H", 14 <= cty < 19 -> "M", otherwise "L".
mpg$grade <- ifelse(mpg$cty >= 19, "H", ifelse(mpg$cty >= 14, "M", "L"))

# 4. Count how many rows fall into each grade.
table(mpg$grade)

# 5. Histograms of cty and hwy side by side; the previous graphics settings
#    are restored afterwards so later plots are not stuck in the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))
for (col_name in c("cty", "hwy")) {
  hist(mpg[[col_name]], main = col_name, col = "yellow")
}
par(old_par)

# 6. Distribution of cty within each grade.
boxplot(mpg$cty ~ mpg$grade)
# Remove the rows whose hwy value is an outlier according to boxplot.stats().

# df.mpg is not defined anywhere above this point in the script, so create it
# from ggplot2::mpg when it does not exist yet; an existing df.mpg is kept.
if (!exists("df.mpg")) {
  df.mpg <- data.frame(ggplot2::mpg)
}

# hwy values that boxplot.stats() reports as outliers.
omit <- boxplot.stats(df.mpg$hwy)$out

# Replace those values with NA so the affected rows can be dropped below.
df.mpg$hwy[df.mpg$hwy %in% omit] <- NA

# Number of NA cells per row, then the number of rows with at least one NA.
rowSums(is.na(df.mpg))
sum(rowSums(is.na(df.mpg)) > 0)

# Keep only the rows without any NA.
no.outlier.df.mpg <- df.mpg[complete.cases(df.mpg), ]

# Sanity check: no row of the filtered frame should still contain an NA.
sum(rowSums(is.na(no.outlier.df.mpg)) > 0)
데이터 프레임 정렬
# Sorting a data frame with order().
df.mpg <- data.frame(ggplot2::mpg)

# Row positions that would put hwy in ascending order.
order(df.mpg$hwy)

# Rows sorted by hwy, ascending.
df.mpg[order(df.mpg$hwy), ]

# Rows sorted by hwy, descending.
df.mpg[order(df.mpg$hwy, decreasing = TRUE), ]

# Rows sorted by hwy with displ as the tie-breaker; the single
# decreasing = TRUE applies to both keys.
df.mpg[order(df.mpg$hwy, df.mpg$displ, decreasing = TRUE), ]
데이터 집계
- aggregate()함수
# Example: computing a per-group summary (here the mean) of two-dimensional
# data with aggregate().
student.list <- read.csv("example_studentlist.csv")

# Formula interface: mean weight for each bloodtype. The columns are resolved
# through `data =`, so the result columns are named bloodtype and weight.
aggregate(weight ~ bloodtype, data = student.list, FUN = mean)

# Same grouping through the `by =` interface.
aggregate(student.list$weight, by = list(student.list$bloodtype), FUN = mean)
# Two small score tables keyed by name: "c" appears only in math and "d" only
# in sci, so each join type below returns a different set of rows.
math <- data.frame(name = c("a", "b", "c"), math = c(70, 80, 90))
# NOTE(review): the score column of sci is also called `math`, so merge()
# returns it as math.x / math.y. It was presumably meant to be `sci` -
# confirm before renaming.
sci <- data.frame(name = c("a", "b", "d"), math = c(10, 20, 30))
math
sci

# inner join (only the names present in both tables)
merge(math, sci, by = "name")

# left join (every row of the left table is kept)
merge(math, sci, by = "name", all.x = TRUE)

# right join (every row of the right table is kept)
merge(math, sci, by = "name", all.y = TRUE)

# outer join (every row of both tables is kept)
merge(math, sci, by = "name", all = TRUE)