# 무조건 빈도수가 높다고 다 좋은 의미는 아니다
# 문장에서 사용된 단어의 긍정과 부정의 빈도에 따라 평가
# Install any course packages that are not yet present in the local library.
package1 <- c("ggplot2", "Rcpp", "dplyr", "ggthemes", "ggmap", "devtools", "RCurl", "igraph", "rgl", "lavaan", "semPlot")
package2 <- c("twitteR", "XML", "plyr", "doBy", "RJSONIO", "tm", "RWeka", "base64enc")
list.of.packages <- c(package1, package2)
# Wanted packages missing from installed.packages().
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
# Explicit length comparison instead of relying on integer truthiness.
if (length(new.packages) > 0) install.packages(new.packages)
library(twitteR)
library(plyr) # released before dplyr; provides laply() used by score.sentiment below
library(stringr) # string handling in R (str_split, ...)
# Samsung tweets: load the saved status list and flatten it to a data frame.
load('samsung_tweets.rda')
samsung_tweets
st <- twListToDF(samsung_tweets)
head(st)
head(st, 1)
names(st)
st$text
st_text <- st$text # keep only the tweet text column
st_text
# gsub() : substitute matches globally
# gsub("[^가-힣]", " ", data) : replace everything except Hangul with spaces
# "\\W" matches non-word characters (punctuation, symbols)
st_text <- gsub("\\W", " ", st_text)
tail(st_text, 10)
st_df <- as.data.frame(st_text)
st_text
st_df
# Apple tweets: same preparation as the Samsung set —
# load, flatten to a data frame, keep the text column, strip non-word characters.
load("apple_tweets.rda")
apple_tweets
at <- twListToDF(apple_tweets)
head(at)
head(at, 1)
names(at)
at$text
at_text <- at$text
at_text
at_text <- gsub("\\W", " ", at_text)
tail(at_text, 10)
at_df <- as.data.frame(at_text)
at_text
at_df
# References:
# https://github.com/The-ECG/BigData1_1.3.3_Text-Mining
# https://stackoverflow.com/questions/35222946/score-sentiment-function-in-r-return-always-0
# English positive/negative word lists, one word per line; lines starting
# with ';' are file comments and are skipped by scan().
pos.word <- scan("positive-words.txt", what ="character", comment.char = ";")
neg.word <- scan("negative-words.txt", what ="character", comment.char = ";")
# Score the sentiment of each sentence as (#positive words - #negative words).
#
# sentences : character vector of texts to score
# pos.words : character vector of positive sentiment words
# neg.words : character vector of negative sentiment words
# .progress : progress-bar name forwarded to plyr::laply ('none', 'text', ...)
#
# Returns a data frame with columns `score` and `text` (the input sentences).
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {
    # Keep letters and spaces only. The original pattern '[^A-z ]' was buggy:
    # the A-z range also spans the characters [ \ ] ^ _ ` that sit between
    # the upper- and lower-case blocks, so those survived the cleanup.
    sentence <- gsub('[^A-Za-z ]', '', sentence)
    sentence <- tolower(sentence)
    # Split on runs of whitespace and flatten to a single word vector.
    words <- unlist(str_split(sentence, '\\s+'))
    # match() yields NA for words absent from a lexicon; count the hits.
    pos.matches <- !is.na(match(words, pos.words))
    neg.matches <- !is.na(match(words, neg.words))
    sum(pos.matches) - sum(neg.matches)
  }, pos.words, neg.words, .progress = .progress)
  data.frame(score = scores, text = sentences)
}
# Score both English tweet sets and compare score distributions by brand.
samsung_scores <- score.sentiment(st_text, pos.word, neg.word, .progress = 'text')
samsung_scores$score
head(samsung_scores, 2)
hist(samsung_scores$score)
samsung_scores$score
apple_scores <- score.sentiment(at_text, pos.word, neg.word, .progress = 'text')
apple_scores$score
head(apple_scores, 2)
hist(apple_scores$score)
apple_scores$score
# Row counts, used to repeat the brand label once per tweet.
a <- dim(samsung_scores)[1]
b <- dim(apple_scores)[1]
# NOTE(review): cbind() of a character and a numeric column builds a character
# matrix, so `score` comes out as character here; it is converted back with
# as.integer() below before the second density plot.
as.data.frame(cbind(type = rep("samsung",a), score = samsung_scores[ , 1]))
as.data.frame(cbind(type = rep("apple",b), score = apple_scores[ , 1]))
# cbind() : bind two columns together
# rep() : replicate the given value
# rep("samsung", a) : repeat "samsung" a times
alls <- rbind(as.data.frame(cbind(type = rep("samsung",a), score = samsung_scores[ , 1])),
as.data.frame(cbind(type = rep("apple",b), score = apple_scores[ , 1])))
alls
ggplot(alls, aes(x=score, color=type)) + geom_density()
alls$type <- factor(alls$type)
alls$score <- as.integer(alls$score)
ggplot(alls, aes(x=score, color=type)) + geom_density()
# Galaxy tweets (Korean): keep Hangul only, extract nouns with KoNLP,
# and draw a word cloud of noun frequencies.
load("gal_tweets.rda")
gal_tweets
gt <- twListToDF(gal_tweets)
head(gt, 1)
names(gt)
gt_text <- gt$text
gt_text <- gsub("\\W"," ",gt_text)
gt_text
# Drop everything that is not a Hangul syllable.
gt_text <- gsub("[^가-힣]", " ", gt_text)
library(rJava)
library(KoNLP)
extractNoun(gt_text)
gt_nouns <- extractNoun(gt_text)
mode(gt_nouns)
gt_nouns <- unlist(gt_nouns)
gt_nouns
# NOTE(review): useNIADic() loads the NIA noun dictionary but runs after
# extractNoun() above — presumably it should be called before extraction; confirm.
useNIADic()
nchar(gt_nouns)
# Keep nouns of two or more characters only.
gt_nouns <- gt_nouns[nchar(gt_nouns) > 1]
table(gt_nouns)
as.data.frame(table(gt_nouns)) %>% arrange(desc(Freq))
gt_df <- as.data.frame(table(gt_nouns)) %>% arrange(desc(Freq))
library(wordcloud2)
wordcloud2(gt_df)
# iPhone tweets (Korean): same noun-frequency pipeline as the Galaxy set.
load("iphone_tweets.rda")
iphone_tweets
it <- twListToDF(iphone_tweets)
head(it, 1)
names(it)
it_text <- it$text
it_text <- gsub("\\W"," ",it_text)
it_text
# Keep Hangul syllables only.
it_text <- gsub("[^가-힣]", " ", it_text)
extractNoun(it_text)
it_nouns <- extractNoun(it_text)
mode(it_nouns)
it_nouns <- unlist(it_nouns)
it_nouns
# NOTE(review): useNIADic() again runs after extractNoun(); confirm ordering.
useNIADic()
nchar(it_nouns)
it_nouns <- it_nouns[nchar(it_nouns) > 1]
table(it_nouns)
as.data.frame(table(it_nouns)) %>% arrange(desc(Freq))
it_df <- as.data.frame(table(it_nouns)) %>% arrange(desc(Freq))
wordcloud2(it_df)
# Korean positive/negative sentiment lexicons; ';' marks comment lines.
pos.kr.word <- scan("positive-words-ko-v2.txt", what ="character", comment.char = ";")
neg.kr.word <- scan("negative-words-ko-v2.txt", what ="character", comment.char = ";")
# Sentiment scorer reused for the Korean lexicons: score = #positive - #negative.
# Unlike the English version above, the ASCII-only cleanup and lower-casing
# steps are omitted so Hangul text passes through untouched.
#
# sentences : character vector of texts to score
# pos.words : positive-word lexicon
# neg.words : negative-word lexicon
# .progress : progress-bar name forwarded to plyr::laply
# Returns a data frame with columns `score` and `text`.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  per_sentence <- function(sentence, pos.words, neg.words) {
    # Tokenize on runs of whitespace.
    tokens <- unlist(str_split(sentence, '\\s+'))
    # A token hits a lexicon when match() finds it (non-NA).
    hits_pos <- !is.na(match(tokens, pos.words))
    hits_neg <- !is.na(match(tokens, neg.words))
    sum(hits_pos) - sum(hits_neg)
  }
  scores <- laply(sentences, per_sentence, pos.words, neg.words,
                  .progress = .progress)
  data.frame(score = scores, text = sentences)
}
# Score the Korean tweet sets with the Korean lexicons and compare the
# score distributions between brands.
#galaxy
galaxy_scores <- score.sentiment(gt_text, pos.kr.word, neg.kr.word, .progress = 'text')
galaxy_scores$score
head(galaxy_scores, 2)
hist(galaxy_scores$score)
#iphone
iphone_scores <-score.sentiment(it_text, pos.kr.word, neg.kr.word, .progress = 'text')
iphone_scores$score
head(iphone_scores, 2)
hist(iphone_scores$score)
# Row counts for the brand labels.
c <- dim(galaxy_scores)[1]
d <- dim(iphone_scores)[1]
# NOTE(review): cbind() coerces the numeric scores to character; they are
# converted back with as.integer() before the final plot.
as.data.frame(cbind(type = rep("galaxy", c), score = galaxy_scores[ , 1]))
as.data.frame(cbind(type = rep("iphone", d), score = iphone_scores[ , 1]))
alls2 <- rbind(as.data.frame(cbind(type = rep("galaxy", c), score = galaxy_scores[ , 1])),
as.data.frame(cbind(type = rep("iphone", d), score = iphone_scores[ , 1])))
alls2
ggplot(alls2, aes(x=score, color=type)) + geom_density()
summary(alls2)
alls2$type <- factor(alls2$type)
alls2$score <- as.integer(alls2$score)
ggplot(alls2, aes(x=score, color=type)) + geom_density()
library(readxl)
# Course-survey analysis: read the survey workbook and compare self-reported
# participation across courses.
read_excel("수강설문조사정리.xlsx")
data <- read_excel("수강설문조사정리.xlsx")
data$`강의 내용 만족도 [2. 나의 학습태도는 적극적이며 성실히 수업에 참여하고 있다고 생각하나요?]`
# Copy the long survey-question columns into short working names.
# (Fixed: the original assigned from a non-existent `text` object — the
# columns live in `data`.)
data$`나의참여도` <- data$`강의 내용 만족도 [2. 나의 학습태도는 적극적이며 성실히 수업에 참여하고 있다고 생각하나요?]`
data$`수강한 과목을 선택해주세요.`
data$`수강과목` <- data$`수강한 과목을 선택해주세요.`
summary(data)
# NOTE(review): count() inside summarise() relies on plyr::count being on the
# search path; the table() calls below give the same tabulation — confirm intent.
data %>% group_by(수강과목) %>% summarise(count(나의참여도))
table(data$수강과목)
table(data$나의참여도)
# Recode the Likert labels to numbers.
# NOTE(review): the scale jumps from 0 straight to 2 ("동의하지 않음" -> 2) and
# any unmatched label falls through to 0 — confirm this mapping is intentional.
data$나의참여도 <- ifelse(data$나의참여도 == "전혀 동의하지 않음", 0,
ifelse(data$나의참여도 == "동의하지 않음", 2,
ifelse(data$나의참여도 == "보통", 3,
ifelse(data$나의참여도 == "동의함", 4,
ifelse(data$나의참여도 == "매우 동의함", 5, 0)))))
# Bar chart: participation score per course, bars side by side.
ggplot(data, aes(x=나의참여도, fill=수강과목)) + geom_bar(position = 'dodge') + ylab('학생수')
# 회귀모형은 lm(y~x) 함수를 이용한다 -> lm(y ~ x, data= )
# lm : linear model
# y ~ x : y는 x를 따른다
# Simple linear regression demo on the built-in `cars` data set
# (speed vs stopping distance).
cars
head(cars)
summary(cars)
plot(cars)
pairs(cars)
library(ggplot2)
ggplot(cars, aes(x=speed, y=dist)) + geom_point()
# The scatter plot suggests a positive (linear) association between speed and
# dist -> verify with a correlation test.
# cor() : 상관행렬(correlation matrix)
# cor.test(x, y) : x, y의 상관관계 검정
cor(cars)
cor.test(cars$speed, cars$dist)
# p-value = 1.49e-12 : highly significant; estimated correlation ~ 0.8069.
# Fit the simple linear regression dist = b0 + b1 * speed.
fit.cars <- lm(dist ~ speed, data=cars)
fit.cars # fitted model: y = -17.579 + 3.932 * x
summary(fit.cars) # coefficients, residual summary, and fit statistics
str(fit.cars)
names(fit.cars)
# Use extractor functions instead of attach(): the original attach(fit.cars)
# put the model components on the search path, where the bare name
# `coefficients` shadowed the stats function of the same name.
coef(fit.cars)
residuals(fit.cars) # residuals of the fit
data.frame(speed = 20)
data.frame(speed = 120)
data.frame(speed = c(20, 120))
# Confidence intervals for the mean response at new speeds.
# (speed = 120 lies far outside the observed data: extrapolation.)
predict(fit.cars, data.frame(speed = 20), interval = "confidence")
predict(fit.cars, data.frame(speed = c(20, 120)), interval = "confidence")