R코딩 베이즈분류기[뉴스판단용]

경환오·2021년 10월 28일
0

학교공부

목록 보기
3/8

R코딩 베이즈분류기 [뉴스판단용]

『해커 스타일로 배우는 기계 학습』을 참고했습니다.

#라이브러리 불러오기

library(tm)

library(ggplot2)

# Directory paths of the news corpora used by this script.
# fakenews.path and realnews.path are read to build the term tables,
# factnews.path and the three "_2" directories are only classified.

fakenews.path <- "data/fake_news/"

fakenews2.path <- "data/fake_news_2/"

realnews.path <- "data/real_news/"

realnews2.path <- "data/real_news_2/"

factnews.path <- "data/fact_news/"

factnews2.path <- "data/fact_news_2/"

# Read a text file and return its whole content as a single string.
# Shared by every step of the script that needs the text of a document.
#
# Args:
#   path: Path of the file to read; it is opened as UTF-8 text.
#
# Returns:
#   A length-one character vector holding all lines of the file pasted
#   together without a separator ("" when the file has no lines).
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "UTF-8")
  # Release the connection even when readLines() stops with an error.
  on.exit(close(con), add = TRUE)

  text <- readLines(con)

  # Pasting the vector directly also covers the zero-line case, where
  # indexing with seq(1, length(text), 1) would raise an error.
  paste(text, collapse = "")
}

# Read every document of the fake-news training directory.
fakenews.docs <- dir(fakenews.path)

# Drop an entry named "cmds", as the other directory listings in this
# script do, so that it is never read as a news document.
fakenews.docs <- fakenews.docs[fakenews.docs != "cmds"]

# vapply() always yields a character vector named by file (one string per
# document); sapply() would return list() for an empty directory.
all.fakenews <- vapply(
  fakenews.docs,
  function(p) get.msg(file.path(fakenews.path, p)),
  character(1)
)

# Build a term-document matrix from a character vector of documents.
#
# Args:
#   doc.vec: Character vector; each element becomes one document of the
#     corpus.
#
# Returns:
#   The object returned by tm's TermDocumentMatrix() for that corpus, with
#   stop-word, punctuation and number removal requested in the control
#   list.
get.tdm <- function(doc.vec) {
  tdm.options <- list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    # NOTE(review): minDocFreq may not be an option that current tm
    # releases honour - confirm against the installed tm version.
    minDocFreq = 2
  )

  TermDocumentMatrix(VCorpus(VectorSource(doc.vec)), tdm.options)
}

# Build the fake-news term table: one row per term with its total
# frequency, its share of all counted terms (density) and the fraction of
# documents that contain it (occurrence).
fakenews.tdm <- get.tdm(all.fakenews)
fakenews.matrix <- as.matrix(fakenews.tdm)
fakenews.counts <- rowSums(fakenews.matrix)

# Create the columns with their final names and types directly instead of
# pushing the counts through a character matrix and parsing them back.
fakenews.df <- data.frame(
  term = as.character(names(fakenews.counts)),
  frequency = as.numeric(fakenews.counts),
  stringsAsFactors = FALSE
)

# Fraction of documents in which each term occurs at least once.
# rowSums() on the logical matrix replaces a 1:nrow() loop, which would
# try to index rows 1 and 0 when the matrix has no rows. unname() keeps
# the default 1..n row names of the data frame.
fakenews.occurrence <-
  unname(rowSums(fakenews.matrix > 0)) / ncol(fakenews.matrix)

fakenews.density <- fakenews.df$frequency / sum(fakenews.df$frequency)

fakenews.df <- transform(
  fakenews.df,
  density = fakenews.density,
  occurrence = fakenews.occurrence
)

# Most frequent terms first.
fakenews.df <- fakenews.df[order(-fakenews.df$frequency), ]

head(fakenews.df, n = 30)



# Read every document of the real-news training directory, skipping an
# entry named "cmds".
realnews.docs <- dir(realnews.path)
realnews.docs <- realnews.docs[realnews.docs != "cmds"]

# vapply() always yields a character vector named by file (one string per
# document); sapply() would return list() for an empty directory.
all.realnews <- vapply(
  realnews.docs,
  function(p) get.msg(file.path(realnews.path, p)),
  character(1)
)

# Build the real-news term table: one row per term with its total
# frequency, its share of all counted terms (density) and the fraction of
# documents that contain it (occurrence).
realnews.tdm <- get.tdm(all.realnews)
realnews.matrix <- as.matrix(realnews.tdm)
realnews.counts <- rowSums(realnews.matrix)

# Create the columns with their final names and types directly instead of
# pushing the counts through a character matrix and parsing them back.
realnews.df <- data.frame(
  term = as.character(names(realnews.counts)),
  frequency = as.numeric(realnews.counts),
  stringsAsFactors = FALSE
)

# Fraction of documents in which each term occurs at least once.
# rowSums() on the logical matrix replaces a 1:nrow() loop, which would
# try to index rows 1 and 0 when the matrix has no rows. unname() keeps
# the default 1..n row names of the data frame.
realnews.occurrence <-
  unname(rowSums(realnews.matrix > 0)) / ncol(realnews.matrix)

realnews.density <- realnews.df$frequency / sum(realnews.df$frequency)

realnews.df <- transform(
  realnews.df,
  density = realnews.density,
  occurrence = realnews.occurrence
)

# Most frequent terms first.
realnews.df <- realnews.df[order(-realnews.df$frequency), ]

head(realnews.df, n = 30)

# Naive Bayes score of one document against a trained term table.
#
# Args:
#   path: File holding the document to score.
#   training.df: Data frame providing the columns `term` and `occurrence`.
#   prior: Prior weight multiplied into the score.
#   c: Weight used for every document term that is absent from
#     training.df.
#
# Returns:
#   prior * (product of the occurrence values of the matched terms) *
#   c ^ (number of document terms without a match).
classify.news <- function(path, training.df, prior = 0.5, c = 1e-6) {
  doc.freq <- rowSums(as.matrix(get.tdm(get.msg(path))))
  known.terms <- intersect(names(doc.freq), training.df$term)

  # Nothing in common with the training vocabulary: every term weighs c.
  if (length(known.terms) < 1) {
    return(prior * c^(length(doc.freq)))
  }

  known.probs <- training.df$occurrence[match(known.terms, training.df$term)]
  prior * prod(known.probs) * c^(length(doc.freq) - length(known.terms))
}



# Score every document of the fact-news directory (skipping an entry named
# "cmds") against both term tables.
factnews.docs <- dir(factnews.path)
factnews.docs <- factnews.docs[factnews.docs != "cmds"]

factnews.faketest <- sapply(factnews.docs, function(p) {
  classify.news(paste(factnews.path, p, sep = "/"), training.df = fakenews.df)
})

factnews.realtest <- sapply(factnews.docs, function(p) {
  classify.news(paste(factnews.path, p, sep = "/"), training.df = realnews.df)
})

# TRUE where the fake-news score is at least as large as the real-news
# score.
factnews.res <- factnews.faketest >= factnews.realtest

summary(factnews.res)





# Score one document against both term tables with classify.news()'s
# default prior.
#
# Args:
#   path: File holding the document to classify.
#
# Returns:
#   Numeric vector c(fake score, real score, flag); flag is 1 when the
#   fake score is strictly greater than the real score and 0 otherwise.
fakenews.classifier <- function(path) {
  scores <- c(
    classify.news(path, fakenews.df),
    classify.news(path, realnews.df)
  )

  c(scores, as.numeric(scores[1] > scores[2]))
}



# List the documents of the three "_2" directories, dropping an entry
# named "cmds" from each listing.
realnews2.docs <- dir(realnews2.path)
realnews2.docs <- realnews2.docs[realnews2.docs != "cmds"]

factnews2.docs <- dir(factnews2.path)
factnews2.docs <- factnews2.docs[factnews2.docs != "cmds"]

fakenews2.docs <- dir(fakenews2.path)
fakenews2.docs <- fakenews2.docs[fakenews2.docs != "cmds"]

# Classify every listed document. Each list element is the vector returned
# by fakenews.classifier(); warnings raised on the way are suppressed.
realnews2.class <- suppressWarnings(
  lapply(file.path(realnews2.path, realnews2.docs), fakenews.classifier)
)

factnews2.class <- suppressWarnings(
  lapply(file.path(factnews2.path, factnews2.docs), fakenews.classifier)
)

fakenews2.class <- suppressWarnings(
  lapply(file.path(fakenews2.path, fakenews2.docs), fakenews.classifier)
)



# Stack the per-document result vectors of each news type into a matrix
# (one row per document).
realnews2.matrix <- do.call(rbind, realnews2.class)
factnews2.matrix <- do.call(rbind, factnews2.class)
fakenews2.matrix <- do.call(rbind, fakenews2.class)

# Append the news type as a fourth column; this turns every cell into a
# character string.
realnews2.final <- cbind(realnews2.matrix, "REALNEWS")
factnews2.final <- cbind(factnews2.matrix, "FACTNEWS")
fakenews2.final <- cbind(fakenews2.matrix, "FAKENEWS")

class.matrix <- rbind(realnews2.final, factnews2.final, fakenews2.final)

# Convert to a data frame and restore the intended column types.
class.df <- as.data.frame(class.matrix, stringsAsFactors = FALSE)
colnames(class.df) <- c("Pr.FAKENEWS", "Pr.REALNEWS", "Class", "Type")

prob.cols <- c("Pr.FAKENEWS", "Pr.REALNEWS")
class.df[prob.cols] <- lapply(class.df[prob.cols], as.numeric)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)

head(class.df)



# Split class.df by news type and by the classifier flag (Class), then
# count the rows of each subset.
realnews.False <- class.df[
  which(class.df$Type == "REALNEWS" & !class.df$Class),
]
realnews.True <- class.df[
  which(class.df$Type == "REALNEWS" & class.df$Class),
]
realnews.FalseCount <- nrow(realnews.False)
realnews.TrueCount <- nrow(realnews.True)

factnews.False <- class.df[
  which(class.df$Type == "FACTNEWS" & !class.df$Class),
]
factnews.True <- class.df[
  which(class.df$Type == "FACTNEWS" & class.df$Class),
]
factnews.FalseCount <- nrow(factnews.False)
factnews.TrueCount <- nrow(factnews.True)

fakenews.False <- class.df[
  which(class.df$Type == "FAKENEWS" & !class.df$Class),
]
fakenews.True <- class.df[
  which(class.df$Type == "FAKENEWS" & class.df$Class),
]
fakenews.FalseCount <- nrow(fakenews.False)
fakenews.TrueCount <- nrow(fakenews.True)

# One row per news type, one column per flag value.
realnews.row <- c(realnews.FalseCount, realnews.TrueCount)
factnews.row <- c(factnews.FalseCount, factnews.TrueCount)
fakenews.row <- c(fakenews.FalseCount, fakenews.TrueCount)

allNews <- rbind(realnews.row, factnews.row, fakenews.row)
colnames(allNews) <- c("False", "True")

allNews



# Scatter plot of the log fake-news score against the log real-news score,
# one point per classified document, with the line y = x drawn on top.
class.plot <- ggplot(
  class.df,
  aes(x = log(Pr.REALNEWS), y = log(Pr.FAKENEWS))
) +
  # alpha stays inside aes() as in the original call; the legend it would
  # create is switched off by scale_alpha(guide = "none") below.
  geom_point(aes(shape = Type, alpha = 0.5)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(
    values = c("REALNEWS" = 1, "FACTNEWS" = 2, "FAKENEWS" = 3),
    name = "News Type"
  ) +
  scale_alpha(guide = "none") +
  xlab("log[Pr(REALNEWS)]") +
  ylab("log[Pr(FAKENEWS)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())

# Write the plot to a 10 x 10 PDF under "./".
ggsave(
  plot = class.plot,
  filename = file.path("./", "03_final_classification.pdf"),
  height = 10,
  width = 10
)

# Shares of FALSE and TRUE entries in a vector.
#
# Args:
#   bool.vector: Vector whose elements are compared with FALSE and TRUE.
#
# Returns:
#   Numeric vector of length two: the number of elements equal to FALSE
#   and the number equal to TRUE, each divided by length(bool.vector).
#   NA elements match neither value but still count in the length.
get.results <- function(bool.vector) {
  total <- length(bool.vector)
  n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
  n.true <- sum(bool.vector == TRUE, na.rm = TRUE)

  c(n.false, n.true) / total
}





# Score one document against both term tables with unequal priors:
# 0.2 for the fake-news table and 0.8 for the real-news table.
#
# Args:
#   path: File holding the document to classify.
#
# Returns:
#   Numeric vector c(fake score, real score, flag); flag is 1 when the
#   fake score is strictly greater than the real score and 0 otherwise.
fakenews.classifier <- function(path) {
  pr.fakenews <- classify.news(path, fakenews.df, prior = 0.2)
  # The real-news score must come from the real-news table; scoring it
  # against fakenews.df would only rescale the fake score by the prior.
  pr.realnews <- classify.news(path, realnews.df, prior = 0.8)

  c(pr.fakenews, pr.realnews, as.numeric(pr.fakenews > pr.realnews))
}



# Share of documents of each news type whose Class flag is FALSE / TRUE.
# NOTE(review): class.df is not rebuilt in this step, so if
# fakenews.classifier was redefined after class.df was created these
# shares still describe the earlier classification - confirm whether a
# re-run was intended.
realnews2.col <- get.results(
  class.df$Class[which(class.df$Type == "REALNEWS")]
)
factnews2.col <- get.results(
  class.df$Class[which(class.df$Type == "FACTNEWS")]
)
fakenews2.col <- get.results(
  class.df$Class[which(class.df$Type == "FAKENEWS")]
)

# One row per news type; the columns are the FALSE and TRUE shares.
class.res <- rbind(realnews2.col, factnews2.col, fakenews2.col)
colnames(class.res) <- c("NOT FAKENEWS", "FAKENEWS")

print(class.res)
profile
방문해주셔서 감사합니다!

0개의 댓글