# Load libraries
library(tm)
library(ggplot2)
# Directory paths.
# fakenews.path and realnews.path hold the files from which the term tables
# (fakenews.df / realnews.df) are built; the fact_news directory and all the
# "*_2" directories hold files that are scored by the classifier further down.
fakenews.path <- "data/fake_news/"
fakenews2.path <- "data/fake_news_2/"
realnews.path <- "data/real_news/"
realnews2.path <- "data/real_news_2/"
factnews.path <- "data/fact_news/"
factnews2.path <- "data/fact_news_2/"
# Read a text file and return its whole content as a single string.
# Shared by every part of the script that loads a document.
#
# Args:
#   path: path of the file to read; opened in text mode and decoded as UTF-8.
#
# Returns:
#   A length-one character vector with all lines of the file concatenated
#   without a separator (line breaks are dropped). An empty file yields "".
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "UTF-8")
  # Release the connection even when readLines() signals an error.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  # Every line is kept. The former text[seq(1, length(text), 1)] selected the
  # same lines but raised an error for a file with zero lines.
  paste(text, collapse = "")
}
# Read every file found in the fake-news training directory with get.msg();
# the result holds one concatenated string per file name.
fakenews.docs <- list.files(fakenews.path)
all.fakenews <- sapply(
  fakenews.docs,
  function(doc) get.msg(file.path(fakenews.path, doc))
)
# Build a term-document matrix from a vector of documents.
#
# Args:
#   doc.vec: the documents to index; wrapped with VectorSource() and VCorpus().
#
# Returns:
#   The value of TermDocumentMatrix() for that corpus, created with the control
#   list below (stopwords, punctuation and number removal switched on).
#
# NOTE(review): minDocFreq may not be a control option that current tm
# versions honour - confirm against the tm documentation before relying on it.
get.tdm <- function(doc.vec) {
  tdm.options <- list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  corpus <- VCorpus(VectorSource(doc.vec))
  TermDocumentMatrix(corpus, tdm.options)
}
# Build the fake-news training table with the helpers above: one row per term
# holding its total count (frequency), its share of all counted terms
# (density) and the fraction of documents that contain it (occurrence).
fakenews.tdm <- get.tdm(all.fakenews)
fakenews.matrix <- as.matrix(fakenews.tdm)
fakenews.counts <- rowSums(fakenews.matrix)
# Create the typed, named columns directly. Going through cbind() would coerce
# the counts to character and require converting them back afterwards.
fakenews.df <- data.frame(
  term = as.character(names(fakenews.counts)),
  frequency = as.numeric(fakenews.counts),
  stringsAsFactors = FALSE
)
# Fraction of documents (matrix columns) in which each term appears at least
# once. The vectorised form is also well defined for a matrix with zero rows,
# where iterating over 1:nrow() would visit the non-existent rows 1 and 0.
fakenews.occurrence <- unname(rowSums(fakenews.matrix > 0)) / ncol(fakenews.matrix)
fakenews.density <- fakenews.df$frequency / sum(fakenews.df$frequency)
fakenews.df <- transform(fakenews.df, density = fakenews.density, occurrence = fakenews.occurrence)
# Most frequent terms first.
fakenews.df <- fakenews.df[order(-fakenews.df$frequency), ]
head(fakenews.df, n = 30)
# List the real-news training directory, skip the entry named "cmds", and read
# each remaining file with get.msg().
realnews.docs <- list.files(realnews.path)
realnews.docs <- realnews.docs[realnews.docs != "cmds"]
all.realnews <- sapply(
  realnews.docs,
  function(doc) get.msg(paste(realnews.path, doc, sep = "/"))
)
# Build the real-news training table: one row per term holding its total count
# (frequency), its share of all counted terms (density) and the fraction of
# documents that contain it (occurrence).
realnews.tdm <- get.tdm(all.realnews)
realnews.matrix <- as.matrix(realnews.tdm)
realnews.counts <- rowSums(realnews.matrix)
# Create the typed, named columns directly. Going through cbind() would coerce
# the counts to character and require converting them back afterwards.
realnews.df <- data.frame(
  term = as.character(names(realnews.counts)),
  frequency = as.numeric(realnews.counts),
  stringsAsFactors = FALSE
)
# Fraction of documents (matrix columns) in which each term appears at least
# once. The vectorised form is also well defined for a matrix with zero rows,
# where iterating over 1:nrow() would visit the non-existent rows 1 and 0.
realnews.occurrence <- unname(rowSums(realnews.matrix > 0)) / ncol(realnews.matrix)
realnews.density <- realnews.df$frequency / sum(realnews.df$frequency)
realnews.df <- transform(realnews.df, density = realnews.density, occurrence = realnews.occurrence)
# Most frequent terms first.
realnews.df <- realnews.df[order(-realnews.df$frequency), ]
head(realnews.df, n = 30)
# Naive Bayes scoring step: rate one document against a term table learned
# from the training data above.
#
# Args:
#   path:        file to score; read with get.msg() and tokenised with get.tdm().
#   training.df: data frame with a `term` column and an `occurrence` column
#                (per-term probability), e.g. fakenews.df or realnews.df.
#   prior:       prior probability of the class described by training.df.
#   c:           probability assigned to each document term that is absent
#                from training.df.
#   log.p:       if TRUE, return the natural log of the score, computed as a
#                sum of logs. The default FALSE keeps the plain product.
#
# Returns:
#   A single number: prior * product of the occurrence values of the matched
#   terms * c^(number of unmatched terms), or its logarithm when log.p = TRUE.
#
# The plain product underflows to 0 once a document has a few dozen unmatched
# terms (c^n with c = 1e-6), which makes scores of different classes
# indistinguishable; log.p = TRUE avoids that.
classify.news <- function(path, training.df, prior = 0.5, c = 1e-6, log.p = FALSE) {
  msg <- get.msg(path)
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  msg.match <- intersect(names(msg.freq), training.df$term)
  # With no matching term this is an empty vector, whose product is 1 and
  # whose log-sum is 0, so the "no match" case needs no separate branch.
  match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
  n.unmatched <- length(msg.freq) - length(msg.match)
  if (log.p) {
    return(log(prior) + sum(log(match.probs)) + n.unmatched * log(c))
  }
  prior * prod(match.probs) * c^n.unmatched
}
# Score every file of the fact-news directory (except "cmds") against both
# training tables.
factnews.docs <- list.files(factnews.path)
factnews.docs <- factnews.docs[factnews.docs != "cmds"]
factnews.faketest <- sapply(factnews.docs, function(doc) {
  classify.news(paste(factnews.path, doc, sep = "/"), training.df = fakenews.df)
})
factnews.realtest <- sapply(factnews.docs, function(doc) {
  classify.news(paste(factnews.path, doc, sep = "/"), training.df = realnews.df)
})
# TRUE where the fake-news score is at least as large as the real-news score.
factnews.res <- factnews.faketest >= factnews.realtest
summary(factnews.res)
# Score one file against both training tables with the default prior.
#
# Args:
#   path: file to classify.
#
# Returns:
#   Numeric vector of length three: the fake-news score, the real-news score,
#   and 1 if the fake-news score is strictly larger, otherwise 0.
fakenews.classifier <- function(path) {
  fake.score <- classify.news(path, fakenews.df)
  real.score <- classify.news(path, realnews.df)
  is.fake <- as.numeric(fake.score > real.score)
  c(fake.score, real.score, is.fake)
}
# File listings of the three second-set directories, each without the entry
# named "cmds".
realnews2.docs <- list.files(realnews2.path)
realnews2.docs <- realnews2.docs[realnews2.docs != "cmds"]
factnews2.docs <- list.files(factnews2.path)
factnews2.docs <- factnews2.docs[factnews2.docs != "cmds"]
fakenews2.docs <- list.files(fakenews2.path)
fakenews2.docs <- fakenews2.docs[fakenews2.docs != "cmds"]
# Run fakenews.classifier() on every file of each second-set directory. Each
# result is a list with one length-three score vector per file. Warnings
# raised while classifying are suppressed.
realnews2.class <- suppressWarnings(
  lapply(realnews2.docs, function(doc) {
    fakenews.classifier(file.path(realnews2.path, doc))
  })
)
factnews2.class <- suppressWarnings(
  lapply(factnews2.docs, function(doc) {
    fakenews.classifier(file.path(factnews2.path, doc))
  })
)
fakenews2.class <- suppressWarnings(
  lapply(fakenews2.docs, function(doc) {
    fakenews.classifier(file.path(fakenews2.path, doc))
  })
)
# Stack the per-file score vectors of each set into a matrix (one row per file).
realnews2.matrix <- do.call(rbind, realnews2.class)
factnews2.matrix <- do.call(rbind, factnews2.class)
fakenews2.matrix <- do.call(rbind, fakenews2.class)
# Append the set label as a fourth column. Binding a string to a numeric
# matrix makes the whole matrix character, hence the conversions further down.
realnews2.final <- cbind(realnews2.matrix, "REALNEWS")
factnews2.final <- cbind(factnews2.matrix, "FACTNEWS")
fakenews2.final <- cbind(fakenews2.matrix, "FAKENEWS")
# Combine the three sets into one data frame and restore the column types.
class.matrix <- rbind(realnews2.final, factnews2.final, fakenews2.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.FAKENEWS" ,"Pr.REALNEWS", "Class", "Type")
class.df <- transform(
  class.df,
  Pr.FAKENEWS = as.numeric(Pr.FAKENEWS),
  Pr.REALNEWS = as.numeric(Pr.REALNEWS),
  Class = as.logical(as.numeric(Class)),
  Type = as.factor(Type)
)
head(class.df)
# Split each news type by the classifier verdict (Class is logical: TRUE means
# the fake-news score was larger) and count the rows of every subset.
realnews.False <- subset(class.df, Type == "REALNEWS" & !Class)
realnews.True <- subset(class.df, Type == "REALNEWS" & Class)
factnews.False <- subset(class.df, Type == "FACTNEWS" & !Class)
factnews.True <- subset(class.df, Type == "FACTNEWS" & Class)
fakenews.False <- subset(class.df, Type == "FAKENEWS" & !Class)
fakenews.True <- subset(class.df, Type == "FAKENEWS" & Class)

realnews.FalseCount <- nrow(realnews.False)
realnews.TrueCount <- nrow(realnews.True)
factnews.FalseCount <- nrow(factnews.False)
factnews.TrueCount <- nrow(factnews.True)
fakenews.FalseCount <- nrow(fakenews.False)
fakenews.TrueCount <- nrow(fakenews.True)

# One row per news type, columns = number of FALSE / TRUE verdicts.
realnews.row <- c(realnews.FalseCount, realnews.TrueCount)
factnews.row <- c(factnews.FalseCount, factnews.TrueCount)
fakenews.row <- c(fakenews.FalseCount, fakenews.TrueCount)
allNews <- rbind(realnews.row, factnews.row, fakenews.row)
colnames(allNews) <- c("False", "True")
allNews
# Scatter plot of the log fake-news score against the log real-news score, one
# point per classified file, with the line y = x drawn for reference.
# alpha = 0.5 sits inside aes(), so it is mapped as an aesthetic; its legend is
# switched off by scale_alpha(guide = "none"). Axis tick labels are blanked.
class.plot <- ggplot(
  class.df,
  aes(x = log(Pr.REALNEWS), y = log(Pr.FAKENEWS))
) +
  geom_point(aes(shape = Type, alpha = 0.5)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(
    name = "News Type",
    values = c("REALNEWS" = 1, "FACTNEWS" = 2, "FAKENEWS" = 3)
  ) +
  scale_alpha(guide = "none") +
  labs(x = "log[Pr(REALNEWS)]", y = "log[Pr(FAKENEWS)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())
# Write the plot as a 10 x 10 PDF into the current directory.
ggsave(
  filename = file.path("./", "03_final_classification.pdf"),
  plot = class.plot,
  width = 10,
  height = 10
)
# Share of FALSE and of TRUE entries in a vector.
#
# Args:
#   bool.vector: vector whose entries are compared with FALSE and TRUE.
#
# Returns:
#   Numeric vector of length two: (count equal to FALSE, count equal to TRUE),
#   each divided by the full length of bool.vector. NA entries count towards
#   the length but towards neither share.
get.results <- function(bool.vector) {
  n.total <- length(bool.vector)
  n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
  n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(n.false, n.true) / n.total
}
# Variant of fakenews.classifier with unequal priors: 0.2 for fake news and
# 0.8 for real news.
#
# Args:
#   path: file to classify.
#
# Returns:
#   Numeric vector of length three: the fake-news score, the real-news score,
#   and 1 if the fake-news score is strictly larger, otherwise 0.
#
# NOTE(review): in the visible script class.df is not recomputed after this
# redefinition, so the summary that follows still reflects the earlier
# classifier - confirm whether the classification step should be re-run.
fakenews.classifier <- function(path) {
  pr.fakenews <- classify.news(path, fakenews.df, prior = 0.2)
  # The real-news score must come from the real-news table. Scoring it against
  # fakenews.df made the verdict depend on the priors alone.
  pr.realnews <- classify.news(path, realnews.df, prior = 0.8)
  c(pr.fakenews, pr.realnews, as.numeric(pr.fakenews > pr.realnews))
}
# Share of FALSE / TRUE verdicts per news type, taken from class.df.
realnews2.col <- get.results(class.df[which(class.df$Type == "REALNEWS"), "Class"])
factnews2.col <- get.results(class.df[which(class.df$Type == "FACTNEWS"), "Class"])
fakenews2.col <- get.results(class.df[which(class.df$Type == "FAKENEWS"), "Class"])
# One row per news type; first column = share of FALSE, second = share of TRUE.
class.res <- rbind(realnews2.col, factnews2.col, fakenews2.col)
colnames(class.res) <- c("NOT FAKENEWS", "FAKENEWS")
print(class.res)