# Author : tmlab / Date : 2016. 10. 18. 17:40 / Category : Text Mining/R
library(readxl)
# Read the POS-tagged blog posts from the Excel workbook and show the
# structure of the imported table.
blog <- read_excel("after_POS.xlsx")
str(blog)

# Keep only the rows whose article_POS value is not missing; these rows
# are the documents used for the corpus below.
blog_test <- blog[complete.cases(blog$article_POS), ]
library(tm)
library(stringr)
library(rJava)
# Tokenizer for POS-tagged text.
#
# `doc` is split on ';' and each piece is matched against two patterns:
#   1. Hangul/Latin letters followed by "/" and a tag starting with N, V or O
#   2. Hangul letters followed by "/XR"
# The captured word of each match is lower-cased. The result holds all
# captures of the first pattern followed by all captures of the second,
# with non-matching pieces dropped.
ko.words <- function(doc) {
  pieces <- str_split(doc, ';')[[1]]
  tag_patterns <- c('([가-힣a-zA-Z]+)/[NVO]', '([가-힣]+)/XR')

  # Column 2 of the str_match() result is the first capture group;
  # it is NA where the piece did not match the pattern.
  captured <- lapply(tag_patterns, function(pattern) {
    tolower(str_match(pieces, pattern))[, 2]
  })

  words <- unlist(captured)
  words[!is.na(words)]
}
# Restrict the mc.cores option to a single core.
# NOTE(review): presumably set so tm does not parallelise while rJava is
# loaded -- confirm the original motivation.
options(mc.cores = 1)

# Build a volatile corpus with one document per row of blog_test.
# VCorpus is requested explicitly: in tm >= 0.7, Corpus(VectorSource(...))
# returns a SimpleCorpus, and TermDocumentMatrix() on a SimpleCorpus ignores
# custom tokenizer functions, so ko.words would not be applied.
cps <- VCorpus(VectorSource(blog_test$article_POS))

# Term-document matrix using the POS-aware tokenizer; terms shorter than
# two characters are discarded.
tdm <- TermDocumentMatrix(
  cps,
  control = list(
    tokenize = ko.words,
    wordLengths = c(2, Inf)
  )
)
dtm <- as.DocumentTermMatrix(tdm)

# Preview the first six documents. Only this slice is converted to a dense
# matrix; densifying the whole sparse matrix just to print six rows can
# exhaust memory on a large vocabulary.
as.matrix(dtm[seq_len(min(6L, nrow(dtm))), ])
# Fix the random seed before the sampler runs.
set.seed(227)
library(topicmodels)

# Convert the document-term matrix into the documents/vocab structure used
# by the lda package; empty documents are kept (omit_empty = FALSE).
ldaform <- dtm2ldaformat(dtm, omit_empty = FALSE)
library(lda)

# Fit a 10-topic model with the collapsed Gibbs sampler and report the
# elapsed wall-clock time.
start <- Sys.time()
result.lda <- lda.collapsed.gibbs.sampler(
  documents = ldaform$documents,
  K = 10,
  vocab = ldaform$vocab,
  num.iterations = 80000,
  burnin = 25000,
  alpha = 0.01,
  eta = 0.01
)
end <- Sys.time()
end - start
# Number of top words reported per topic. Everything below is derived from
# this single value, so changing it needs no other edits.
n_top <- 20

# Top words of every topic ranked by score; printed and kept in top_topic.
top_topic <- top.topic.words(result.lda$topics, n_top, by.score = TRUE)
top_topic

# Share of all assigned tokens that belongs to each topic.
theta <- rowSums(result.lda$document_sums)
topic.proportion <- theta / sum(theta)

# Within-topic proportion of each top word.
# top.words is an n_top x K character matrix (one column per topic).
# NOTE(review): this ranking uses the default by.score, unlike top_topic
# above -- confirm the difference is intended.
top.words <- top.topic.words(result.lda$topics, n_top)
c_top.words <- as.character(top.words)

# Columns of the topic-word count matrix for the selected words, laid out
# as K consecutive groups of n_top columns (group a = top words of topic a).
new.topics <- result.lda$topics[, c_top.words, drop = FALSE]

# For topic a only the counts in its own column group are needed:
# row a, columns (a - 1) * n_top + 1 ... a * n_top. Pick them with
# (row, column) index pairs instead of shifting values in a loop.
n_topics <- nrow(new.topics)
own_group <- cbind(
  rep(seq_len(n_topics), each = n_top),
  seq_len(n_topics * n_top)
)
count_by_words <- matrix(new.topics[own_group], nrow = n_topics, byrow = TRUE)

# Divide each topic's counts by that topic's total and transpose to
# n_top x K so the layout matches top.words.
proportion_by_words <- t(count_by_words / as.integer(result.lda$topic_sums))

# One cell per (rank, topic): "word ( proportion )".
result <- matrix(
  paste(top.words, "(", proportion_by_words, ")"),
  byrow = FALSE,
  nrow = n_top
)

# Append the overall topic shares and the raw topic totals as extra rows.
output <- rbind(result, topic.proportion, t(result.lda$topic_sums))
# Per-document topic counts: document_sums is transposed so that each row
# is a document and each column a topic.
trends <- as.data.frame(t(result.lda$document_sums), stringsAsFactors = FALSE)
trends
blog$date

# Keep the part of each timestamp before the first space. This is done per
# element, so an entry with a missing or extra token cannot shift the dates
# of the other rows.
date <- sub(" .*$", "", blog$date)
head(date)
str(date)
date <- as.Date(date, format = "20%y.%m.%d")
str(date)

# Reduce every date to its year-month label and store it back on blog.
date <- format(date, "%Y-%m")
date
blog$date <- date

# Convert counts to per-document topic shares; documents with no assigned
# tokens produce 0/0 = NaN, which is replaced by 0.
tot <- rowSums(trends)
trends
str(trends)
trends <- trends / tot
trends[] <- lapply(trends, function(share) replace(share, is.nan(share), 0))

# The corpus was built from the rows of blog with a non-missing article_POS,
# so the dates must be filtered the same way to stay aligned with trends.
topic_names <- paste("Topic", seq_len(ncol(trends)))
doc_dates <- blog$date[complete.cases(blog$article_POS)]
stopifnot(length(doc_dates) == nrow(trends))
trends$date <- doc_dates
colnames(trends) <- c(topic_names, "date")
str(trends)

# Monthly mean share per topic. Only the numeric topic columns are averaged;
# the grouping label ends up in column Group.1.
trends_month <- aggregate(
  trends[topic_names],
  by = list(trends$date),
  FUN = mean
)
trends_month
str(trends_month)
library(ggplot2)

# Topic columns of the monthly table (everything except the month label and
# a possibly remaining date column).
topic_cols <- setdiff(names(trends_month), c("Group.1", "date"))

# Long format: one row per (month, topic). unlist() stacks the topic columns
# one after another, which matches rep(..., each = nrow(trends_month)).
trends_long <- data.frame(
  Group.1 = rep(trends_month$Group.1, times = length(topic_cols)),
  topic = factor(
    rep(topic_cols, each = nrow(trends_month)),
    levels = topic_cols
  ),
  proportion = unlist(trends_month[topic_cols], use.names = FALSE),
  stringsAsFactors = FALSE
)

# Line colours in topic order.
topic_palette <- c(
  "red", "brown", "blue", "yellow", "green",
  "orange", "purple", "aquamarine1", "black", "cyan"
)

# One line per topic. Colour is mapped to the topic so the plot gets a
# legend; group is needed because the x axis (month label) is discrete.
p <- ggplot(
  trends_long,
  aes(x = Group.1, y = proportion, colour = topic, group = topic)
)
trends_plot <- p +
  geom_line() +
  ylab("Topics") + xlab("") + ggtitle("Topic Trends")

# Use the fixed palette when it covers every topic; otherwise keep the
# default ggplot2 colours.
if (length(topic_cols) <= length(topic_palette)) {
  trends_plot <- trends_plot +
    scale_colour_manual(
      values = setNames(topic_palette[seq_along(topic_cols)], topic_cols),
      name = NULL
    )
}
trends_plot