library(foreign)
library(dplyr)
library(ggplot2)
# Load the Koweps survey file once. The raw import is kept in raw_welfare and
# converted to a data frame separately, so the untouched import stays available.
raw_welfare <- read.spss("Koweps_hpc10_2015_beta1.sav")
welfare <- as.data.frame(raw_welfare)
# str() gives a compact overview instead of echoing the whole data set.
str(welfare)

# Give the survey codes readable names and keep only the columns analysed below.
welfare <- welfare %>%
  rename(
    gender = h10_g3,
    birth = h10_g4,
    marriage = h10_g10,
    religion = h10_g11,
    income = p1002_8aq1,
    job = h10_eco9,
    region = h10_reg7
  ) %>%
  select(gender, birth, marriage, religion, income, job, region)

# Recode gender to labels: code 1 becomes "male", every other non-missing code
# becomes "female" (NA stays NA).
# NOTE(review): assumes the only other valid code means female — confirm
# against the survey codebook.
welfare$gender <- ifelse(welfare$gender == 1, "male", "female")
# Inspect the birth-year column before deriving age from it.
class(welfare$birth)
summary(welfare$birth)
# Histogram of birth years. ggplot() + geom_histogram() is used because
# qplot() is deprecated in current ggplot2 releases.
ggplot(welfare, aes(x = birth)) + geom_histogram()
boxplot(welfare$birth) # check for outliers
sum(is.na(welfare$birth)) # count missing values
# Approach 1: compute age inside a dplyr call and only preview the first rows;
# nothing is stored in welfare.
welfare[["birth"]]
head(mutate(welfare, age = (2015 - birth + 1)))
# Approach 2: store age as a new column of welfare.
welfare[["age"]] <- 2015 - welfare[["birth"]] + 1
summary(welfare[["age"]])
# Mean income per age, ignoring missing incomes inside each group.
# Why some rows still show no usable mean: na.rm = TRUE removes the NA values,
# and for an age where every income is missing that leaves an empty vector,
# whose mean() is NaN.
welfare %>%
  group_by(age) %>%
  summarise(mean_income = mean(income, na.rm = TRUE))

# Dropping the rows with missing income first removes such ages from the
# result altogether. The summary is computed once and reused below.
df2 <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))
df2
head(df2)

# Line chart of mean income by age, then a styled version with points added.
ggplot(df2, aes(x = age, y = mean_income)) + geom_line()
ggplot(df2, aes(x = age, y = mean_income)) +
  geom_line(color = "skyblue", size = 1) +
  geom_point(color = "blue", size = 1)
# Bucket age into three generations: under 30 is "young", 30 to 50 is
# "middle", over 50 is "old". The last condition is written explicitly (not as
# a catch-all) so that a missing age stays NA instead of being labelled "old".
welfare <- welfare %>%
  mutate(age_gen = case_when(
    age < 30 ~ "young",
    age <= 50 ~ "middle",
    age > 50 ~ "old"
  ))

# Mean income per generation, computed once from rows with a known income.
df3 <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age_gen) %>%
  summarise(age_gen_income = mean(income))
df3
ggplot(df3, aes(x = age_gen, y = age_gen_income, fill = age_gen)) + geom_col()

# Same chart without an intermediate object: na.rm = TRUE skips missing
# incomes and the summary is piped straight into ggplot().
welfare %>%
  group_by(age_gen) %>%
  summarise(age_gen_income = mean(income, na.rm = TRUE)) %>%
  ggplot(aes(x = age_gen, y = age_gen_income, fill = age_gen)) +
  geom_col()

# Fix the bar order on the x axis to young, middle, old.
ggplot(df3, aes(x = age_gen, y = age_gen_income, fill = age_gen)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))
# Mean income per generation and gender, skipping missing incomes.
# .groups = "drop" returns an ungrouped result so later steps are not silently
# affected by leftover grouping.
df4 <- welfare %>%
  group_by(age_gen, gender) %>%
  summarise(gender_income = mean(income, na.rm = TRUE), .groups = "drop")
df4

# Visualisation with geom_col(): side-by-side bars per gender, generations
# ordered young, middle, old.
ggplot(df4, aes(x = age_gen, y = gender_income, fill = gender)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))
# Preview: mean income for every age and gender combination, using only rows
# where income is present.
welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, gender) %>%
  summarise(mean_income = mean(income))
# Store the same summary for plotting.
df5 <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, gender) %>%
  summarise(mean_income = mean(income))
# Visualisation with geom_line(): one line per gender.
ggplot(data = df5, mapping = aes(x = age, y = mean_income, colour = gender)) +
  geom_line()
# Show the working directory; 'ahn.txt' is looked up relative to it.
getwd()
# Preview the speech file, telling R it was saved in cp949.
read.csv("ahn.txt", fileEncoding = "cp949")
# UTF-8: one way of storing Korean text.
# cp949: the form in which files written in Korean on Windows are saved.
# fileEncoding = 'cp949': states that the file was saved as cp949.
# NOTE(review): this readLines() call does not declare the encoding, so it
# relies on the session's native encoding matching the file — confirm.
readLines("ahn.txt")
# Notes (kept as comments; as bare text these lines stop the script from
# parsing):
# - Requirements: a Korean dictionary (KoNLP) and Java.
# - KoNLP: uses the dictionaries bundled in the package to analyse the parts
#   of speech contained in a document.
# - NLP: Natural Language Processing.
# - Natural language: language in the everyday sense.
# - Formal language: the opposite of natural language (e.g. 1+1=2, CO2).
# Set up Java through multilinguer's install_jdk().
install.packages("multilinguer")
library(multilinguer)
install_jdk()

# Packages installed before KoNLP itself, as prebuilt binaries.
install.packages(
  c(
    "hash", "tau", "Sejong", "RSQLite", "devtools", "bit", "rex",
    "lazyeval", "htmlwidgets", "crosstalk", "promises", "later",
    "sessioninfo", "xopen", "bit64", "blob", "DBI", "memoise", "plogr",
    "covr", "DT", "rcmdcheck", "rversions"
  ),
  type = "binary"
)

# KoNLP is installed from GitHub. The remotes:: prefix calls install_github()
# straight from the remotes package, so library(remotes) is not needed.
install.packages("remotes")
remotes::install_github(
  "haven-jeon/KoNLP",
  upgrade = "never",
  INSTALL_opts = "--no-multiarch"
)

library(rJava)
library(KoNLP)
# Error seen at this point:
#   Fail to locate 'scala-library-2.11.8.jar'. Recommand to locate
#   'scala-library-2.11.8.jar' manually on
#   C:/Users/admin/AppData/Local/R/win-library/4.2/KoNLP/java
# 'scala-library-2.11.8.jar' has to be present in
# C:/Users/admin/AppData/Local/R/win-library/4.2/KoNLP/java.
# After putting the jar there, close and reopen RStudio and start again from
# library(KoNLP).

# Smoke test: if only the nouns of this sentence come back, the setup works.
extractNoun('이 문장에서 명사만 추출되었다면 성공입니다.')
# Read the speech file (saved as cp949) and display it.
text <- read.csv(file = "ahn.txt", fileEncoding = "cp949")
text
# Placeholder: the string below is meant to be replaced by the full text of
# 'ahn.txt', pasted in as a single string.
text <- "'ahn.txt' 파일의 글 전체를 그대로 복사해서 넣기"
useNIADic()
extractNoun(text)
nouns <- extractNoun(text)
mode(nouns) # gives "character"
# If it gives a list instead, run nouns <- unlist(nouns).
nouns
useSejongDic()
useNIADic()
# Character count of each word, used to drop one-character words.
nchar(nouns)
nouns[seq_len(10)]
# Words with more than one character.
nouns[nchar(nouns) >= 2]
nouns <- nouns[nchar(nouns) >= 2]
table(nouns)
# List the words starting with the highest frequency.
as.data.frame(table(nouns))
arrange(as.data.frame(table(nouns)), desc(Freq))
df <- head(arrange(as.data.frame(table(nouns)), desc(Freq)), 10)
# Bar chart of the most frequent nouns.
ggplot(data = df, mapping = aes(x = nouns, y = Freq, fill = nouns)) +
  geom_col()
# Same chart with the axes flipped so the bars run horizontally.
ggplot(data = df, mapping = aes(x = nouns, y = Freq, fill = nouns)) +
  geom_col() +
  coord_flip()
# Word cloud with wordcloud2().
install.packages("wordcloud2")
library(wordcloud2)
wordcloud2(df)
# Using a larger number in head() when building df shows more words.
# Put scala-library-2.11.8.jar in the location named by the error message.
library(KoNLP)
# Smoke test: only the nouns of this sentence should come back.
extractNoun('이 문장에서 명사만 추출되었다면 성공입니다.')
useNIADic()
# useSejongDic()

# Read the speech from file (the text could also be assigned directly as one
# string). The file is saved as cp949, so the encoding is declared on the
# connection instead of relying on the session's native encoding.
speech_con <- file("ahn.txt", encoding = "cp949")
text <- readLines(speech_con)
close(speech_con)

# Extract nouns once; with several input lines the result is a list.
nouns <- extractNoun(text)
mode(nouns) # list
nouns <- unlist(nouns)
# Drop one-character words.
nouns <- nouns[nchar(nouns) > 1]

# The 20 most frequent nouns, as a horizontal bar chart and as a word cloud.
df <- as.data.frame(table(nouns)) %>%
  arrange(desc(Freq)) %>%
  head(20)
ggplot(df, aes(x = nouns, y = Freq, fill = nouns)) +
  geom_col() +
  coord_flip()
wordcloud2(df)