# Gapminder: median life expectancy by continent ----
# Install gapminder only when it is missing, so that re-running the script
# does not reinstall the package (and need network access) every time.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(dplyr)
library(ggplot2) # ggplot() is used below, so ggplot2 must be attached here
library(gapminder)

# List the available datasets, then only those shipped with gapminder.
data()
data(package = "gapminder")
gapminder

# Median life expectancy per continent for the year 2007.
# Computed once, stored in `df`, and printed.
df <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(lifeExp = median(lifeExp))
df

# Bar chart of the summary: the data frame can be passed as the first
# argument or piped into ggplot().
ggplot(df, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_col()
df %>%
  ggplot(aes(x = continent, y = lifeExp, fill = continent)) +
  geom_col()

# Same chart with the legend hidden, then with the legend placed on top.
ggplot(df, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_col() +
  theme(legend.position = "none")
ggplot(df, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_col() +
  theme(legend.position = "top")

# Box plot of life expectancy per continent using every row of gapminder.
ggplot(gapminder, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_boxplot()
unique(gapminder$continent)
# plotly ----
# Notes (previously written as bare text, which is a syntax error in R):
# - plotly charts are interactive.
# - Hovering the mouse over a point shows its information.
# - A chart can be saved as HTML, and data can be loaded straight from HTML.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
library(plotly)

# Turn a ggplot scatter plot into an interactive plotly widget.
# `mpg` is the example dataset that ships with ggplot2.
ggplotly(
  mpg %>%
    ggplot(aes(x = displ, y = cty, color = drv)) +
    geom_point() +
    theme(legend.position = "none")
)

# Bubble chart built directly with plot_ly(): marker size is mapped to the
# Gap column and the School column is used as the point text.
data <- read.csv(
  "https://raw.githubusercontent.com/plotly/datasets/master/school_earnings.csv"
)
fig <- plot_ly(
  data,
  x = ~Women,
  y = ~Men,
  text = ~School,
  type = "scatter",
  mode = "markers",
  marker = list(size = ~Gap, opacity = 0.5)
)
fig <- fig %>%
  layout(
    title = "Gender Gap in Earnings per University",
    xaxis = list(showgrid = FALSE),
    yaxis = list(showgrid = FALSE)
  )
fig
# mtcars: choosing fill colours ----
mtcars
str(mtcars)

# First attempt, while cyl is still numeric.
ggplot(mtcars, aes(x = cyl, fill = cyl)) + geom_bar()

# cyl is numeric (continuous), so convert it to a factor before using it as a
# discrete fill.
mtcars$cyl <- as.factor(mtcars$cyl)
ggplot(mtcars, aes(x = cyl, fill = cyl)) + geom_bar()

# Hue-based palettes. The mapped aesthetic is `fill`, so the matching scale is
# scale_fill_hue(); a colour scale would leave the bars unchanged.
# h: range of hue angles, c: chroma, l: luminance.
ggplot(mtcars, aes(x = cyl, fill = cyl)) +
  geom_bar() +
  scale_fill_hue(c = 50)
ggplot(mtcars, aes(x = cyl, fill = cyl)) +
  geom_bar() +
  scale_fill_hue(h = c(0, 90), c = 50, l = 65)

# ColorBrewer palettes. Palette names are case-sensitive ("Set1", not "set1");
# an unknown name only raises a warning and a different palette is used.
ggplot(mtcars, aes(x = cyl, fill = cyl)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1")
ggplot(mtcars, aes(x = cyl, fill = cyl)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2")
ggplot(mtcars, aes(x = cyl, fill = cyl)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set3")

# Manually chosen colours, by name.
ggplot(mtcars, aes(x = cyl, fill = cyl)) +
  geom_bar() +
  scale_fill_manual(values = c("red", "green", "blue"))

# Colours can also be given as HEX codes (e.g. #ffffff).
ggplot(mtcars, aes(x = cyl, fill = cyl)) +
  geom_bar() +
  scale_fill_manual(values = c("#ffffff", "#c7674a", "#ff00ff"))
# ggrepel: non-overlapping point labels ----
# Install ggrepel only when it is missing instead of on every run.
if (!requireNamespace("ggrepel", quietly = TRUE)) {
  install.packages("ggrepel")
}
library(ggrepel)

# Scatter plot of weight against miles per gallon, then the same plot with
# each point labelled by its row name.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(color = "red")
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(color = "red") +
  geom_text_repel(aes(label = rownames(mtcars)))

# Print the row names.
rownames(mtcars)

# Row names as a regular column: mutate() only prints the result here ...
mtcars %>% mutate(rowname = rownames(mtcars))
# ... while this alternative stores the column in mtcars.
mtcars$rowname <- rownames(mtcars)
mtcars

# Labelled scatter plot using the new column; kept as an object so that the
# themed variants below can reuse it.
labelled_plot <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(color = "red") +
  geom_text_repel(aes(label = rowname))
labelled_plot

# The same plot with ready-made themes from ggthemes.
library(ggthemes)
labelled_plot + theme_wsj()
labelled_plot + theme_economist()
labelled_plot + theme_economist_white()
# Welfare panel data: loading ----
# Install foreign only when it is missing instead of on every run.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)
library(dplyr)
library(ggplot2)

# Read the SPSS data. The file name is relative, so it is looked up in the
# working directory printed by getwd().
getwd()
# Read the file once and print the stored object, rather than reading the
# same file a second time only to print it.
raw_welfare <- read.spss("Koweps_hpc10_2015_beta1.sav")
raw_welfare

# Convert to a data.frame.
welfare <- as.data.frame(raw_welfare)
welfare
str(welfare)
glimpse(welfare)
head(welfare, 2)
summary(welfare)

# How many variables are there?
ncol(welfare)   # number of columns (957 according to the original notes)
dim(welfare)    # rows and columns (16664 x 957 according to the original notes)
dim(welfare)[2] # number of columns only
# Welfare panel data: keep and rename the variables of interest ----
welfare <- welfare %>%
  rename(
    gender = h10_g3,
    birth = h10_g4,
    marriage = h10_g10,
    religion = h10_g11,
    income = p1002_8aq1,
    job = h10_eco9,
    region = h10_reg7
  ) %>%
  select(gender, birth, marriage, religion, income, job, region)
welfare

# Alternative: select() can rename while selecting. It has to start again from
# the raw data, because the original column names no longer exist in `welfare`
# after the rename above (selecting them from `welfare` would be an error).
welfare <- as.data.frame(raw_welfare) %>%
  select(
    gender = h10_g3,
    birth = h10_g4,
    marriage = h10_g10,
    religion = h10_g11,
    income = p1002_8aq1,
    job = h10_eco9,
    region = h10_reg7
  )
str(welfare)
head(welfare, 2)
summary(welfare) # also shows how many NAs each column has

# Quick visual overview.
plot(welfare)                                   # all variables at once
pairs(job ~ income + gender, data = welfare)    # only some of the variables
boxplot(welfare)
boxplot(welfare$income)
boxplot(welfare$income, welfare$job)

# Missing values.
is.na(welfare)
# Tabulate the missing/non-missing flags. Cross-tabulating all seven raw
# columns with table(welfare) would build a 7-way contingency table, which
# does not fit the NA checks around it.
table(is.na(welfare))
sum(is.na(welfare))     # each TRUE counts as 1, so this is the number of NAs
colSums(is.na(welfare)) # number of NAs per column

# Summary statistics of income.
summary(welfare$income)
mean(welfare$income)                 # NA whenever income contains an NA
mean(welfare$income, na.rm = TRUE)
range(welfare$income, na.rm = TRUE)  # range: from the minimum to the maximum
min(welfare$income, na.rm = TRUE)
max(welfare$income, na.rm = TRUE)

# Replace NA with 0.
# Missing values must be tested with is.na(welfare$income);
# welfare$income == NA cannot be used, because NA is an empty value.
welfare$income <- ifelse(is.na(welfare$income), 0, welfare$income)
welfare$income

# Replace 0 with NA.
welfare$income <- ifelse(welfare$income == 0, NA, welfare$income)
welfare$income
summary(welfare$income)
# Welfare panel data: income and gender plots ----
# One variable only (this note used to be bare text, which is a syntax error).
ggplot(welfare, aes(x = income)) + geom_density()
ggplot(welfare, aes(x = income, color = gender)) + geom_density()

# gender is stored as a number (continuous), so convert it to a factor in
# order to get one colour per gender.
ggplot(welfare, aes(x = income, color = factor(gender))) + geom_density()

# geom_density() gives the smoother curve; geom_freqpoly() shows counts.
ggplot(welfare, aes(x = income, color = factor(gender))) + geom_freqpoly()

# Recode gender: 1 becomes "male", every other value becomes "female".
summary(welfare$gender)
welfare$gender <- ifelse(welfare$gender == 1, "male", "female")
summary(welfare$gender)

# Frequency table of gender.
table(welfare$gender)         # a table is not a data.frame
mode(table(welfare$gender))   # reported as numeric
typeof(table(welfare$gender)) # reported as integer
as.data.frame(table(welfare$gender))
gd <- as.data.frame(table(welfare$gender))

# Bar chart from the frequency table, using the default column names.
ggplot(gd, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col()
ggplot(gd, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  xlab('성별') +
  ylab('명수') +
  theme(legend.position = "none")

# Rename the columns with rename() ...
gd <- gd %>% rename(gender = Var1, count = Freq)
gd
ggplot(gd, aes(x = gender, y = count, fill = gender)) +
  geom_col()

# ... or with names().
names(gd) <- c('성별', '명수')
gd
ggplot(gd, aes(x = 성별, y = 명수, fill = 성별)) +
  geom_col()

# geom_bar() counts the rows itself, so the raw data can be used directly.
ggplot(welfare, aes(x = gender)) +
  geom_bar()
ggplot(welfare, aes(x = gender, fill = gender)) +
  geom_bar()

# Mean income per gender: drop NAs inside mean(), or filter them out first.
welfare %>%
  group_by(gender) %>%
  summarise(mean = mean(income, na.rm = TRUE))
welfare %>%
  filter(!is.na(income)) %>%
  group_by(gender) %>%
  summarise(mean = mean(income))

# Bar chart of the mean income per gender.
welfare %>%
  group_by(gender) %>%
  summarise(mean = mean(income, na.rm = TRUE)) %>%
  ggplot(aes(x = gender, y = mean, fill = gender)) +
  geom_col()