# Author : tmlab / Date : 2016. 10. 26. 20:35 / Category : Text Mining/R
#' Attach packages, installing any that are missing first.
#'
#' @param x Character vector of package names.
#' @return `NULL`, invisibly; called for its side effect of attaching the
#'   packages to the search path.
rcv <- function(x) {
  for (pkg in x) {
    # requireNamespace() is the documented way to probe for one package.
    # Scanning .packages(all.available = TRUE) on every iteration is slow on
    # large libraries and is an incomplete availability test.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }
  invisible(NULL)
}
# Attach the three packages this script uses, installing them if needed.
rcv(c("twitteR", "SnowballC", "tm"))
## Loading required package: NLP

# Authenticate against the Twitter API. The four credential variables are not
# defined in the visible part of this script and must already exist in the
# session.
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## [1] "Using direct authentication"

# Request up to 3200 tweets from the RDataMining timeline and report how many
# were actually returned.
tweets <- userTimeline("RDataMining", n = 3200)
(n.tweet <- length(tweets))
## [1] 466

# Preview the first tweets. Unlike tweets[1:5], head() does not pad the list
# with NULL entries when fewer than five tweets were returned.
head(tweets, 5)
## [[1]]
## [1] "RDataMining: Seminar: Exploring causal relationships in observational data, Prof. Jiuyong Li. Canberra, 4:15pm Wed 16 Nov 2016. https://t.co/tXBWBfv01J"
##
## [[2]]
## [1] "RDataMining: Three Research Scholarships (PhD or Research Master) in Data Science & Analytics, based in Canberra. Apply by 24 Oct https://t.co/CUj6IRoWzg"
##
## [[3]]
## [1] "RDataMining: Slides and other materials for the R and Data Mining Short Course at University of Canberra are now available at https://t.co/xKwtcnvjj7"
##
## [[4]]
## [1] "RDataMining: Canberra HealthHack 2016, 14th - 16th October https://t.co/YJiAKCODdJ"
##
## [[5]]
## [1] "RDataMining: @AliMAllaith sorry, seems not. Please check the link for details."
# Flatten the list of tweets into a data frame and show its column structure.
tweets.df <- twListToDF(twList = tweets)
str(object = tweets.df)
## 'data.frame': 466 obs. of 16 variables:
## $ text : chr "Seminar: Exploring causal relationships in observational data, Prof. Jiuyong Li. Canberra, 4:15pm Wed 16 Nov 2016. https://t.co"| __truncated__ "Three Research Scholarships (PhD or Research Master) in Data Science & Analytics, based in Canberra. Apply by 24 Oct https:"| __truncated__ "Slides and other materials for the R and Data Mining Short Course at University of Canberra are now available at https://t.co/x"| __truncated__ "Canberra HealthHack 2016, 14th - 16th October https://t.co/YJiAKCODdJ" ...
## $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ favoriteCount: num 3 2 10 0 0 3 1 0 4 8 ...
## $ replyToSN : chr NA NA NA NA ...
## $ created : POSIXct, format: "2016-10-26 08:36:01" "2016-10-17 14:01:54" ...
## $ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSID : chr NA NA NA NA ...
## $ id : chr "791196673649233920" "788017193019518976" "783785179684798464" "781420267885072384" ...
## $ replyToUID : chr NA NA NA NA ...
## $ statusSource : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
## $ screenName : chr "RDataMining" "RDataMining" "RDataMining" "RDataMining" ...
## $ retweetCount : num 1 2 6 1 0 1 1 0 1 7 ...
## $ isRetweet : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ retweeted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ longitude : logi NA NA NA NA NA NA ...
## $ latitude : logi NA NA NA NA NA NA ...
# Build the corpus from the tweet texts and run the first cleaning passes in
# order: lower-casing, punctuation removal, digit removal. Each pass receives
# the corpus produced by the previous one.
myCorpus <- Reduce(
  function(corpus, cleaner) tm_map(corpus, cleaner),
  list(content_transformer(tolower), removePunctuation, removeNumbers),
  Corpus(VectorSource(tweets.df$text))
)
#' Strip URLs from text.
#'
#' Deletes every occurrence of "http" together with all non-whitespace
#' characters that follow it. Matching up to the next whitespace (rather than
#' only alphanumeric characters) removes a link completely whether or not
#' punctuation has already been stripped from the text.
#'
#' @param x Character vector.
#' @return `x` with each match replaced by the empty string.
removeURL <- function(x) {
  gsub("http[^[:space:]]*", "", x)
}
# removeURL is a plain character function, so it must be wrapped in
# content_transformer(). Passing it to tm_map() directly replaces the
# documents with bare character vectors.
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))

# Stop list: the English stop words plus "available" and "via", with "r" and
# "big" taken out of the list so those two words stay in the text.
myStopwords <- c(stopwords("english"), "available", "via")
myStopwords <- setdiff(myStopwords, c("r", "big"))
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus <- tm_map(myCorpus, stripWhitespace)

# No tm_map(myCorpus, PlainTextDocument) repair step is needed here: every
# transformation above keeps the documents as text documents.

# Keep an unstemmed copy of the cleaned corpus.
myCorpusCopy <- myCorpus
# Print the first ten cleaned documents, each prefixed with its index.
# The upper bound is capped at the corpus size because a fixed 1:10 fails
# with a subscript error when the corpus holds fewer than ten documents.
for (i in seq_len(min(10, length(myCorpus)))) {
  cat(paste0("[[", i, "]] "))
  writeLines(as.character(myCorpus[[i]]))
}
## [[1]] seminar exploring causal relationships observational data prof jiuyong li canberra pm wed nov
## [[2]] three research scholarships phd research master data science amp analytics based canberra apply oct
## [[3]] slides materials r data mining short course university canberra now
## [[4]] canberra healthhack th th october
## [[5]] alimallaith sorry seems please check link details
## [[6]] phd scholarships data science analytics canberra australia
## [[7]] free halfday short course r data mining university canberra ampm fri oct seats limited
## [[8]] hamishbr will travel canberra future interested giving talk canberra data scientists meetup thanks
## [[9]] getting started apache spark free ebook
## [[10]] using natural language processing nontextual data mllib presentation hadoop summit melbourne
# Stem every document of the cleaned corpus.
myCorpus1 <- tm_map(myCorpus, stemDocument)

# Print the first ten stemmed documents, each prefixed with its index.
# The bound is capped at the corpus size so a short corpus does not trigger
# a subscript error.
for (i in seq_len(min(10, length(myCorpus1)))) {
  cat(paste0("[[", i, "]]"))
  writeLines(as.character(myCorpus1[[i]]))
}
## [[1]]seminar explor causal relationship observ data prof jiuyong li canberra pm wed nov
## [[2]]three research scholarship phd research master data scienc amp analyt base canberra appli oct
## [[3]]slide materi r data mine short cours univers canberra now
## [[4]]canberra healthhack th th octob
## [[5]]alimallaith sorri seem pleas check link detail
## [[6]]phd scholarship data scienc analyt canberra australia
## [[7]]free halfday short cours r data mine univers canberra ampm fri oct seat limit
## [[8]]hamishbr will travel canberra futur interest give talk canberra data scientist meetup thank
## [[9]]get start apach spark free ebook
## [[10]]use natur languag process nontextu data mllib present hadoop summit melbourn
#' Complete the stemmed words of one document against a dictionary.
#'
#' Splits the document text on single spaces, completes each stem with
#' `stemCompletion(type = "shortest")`, and joins the words back into one
#' whitespace-normalised text document.
#'
#' @param x A text document (anything `as.character()` turns into its text).
#' @param dict Dictionary passed to `stemCompletion()`. NOTE(review): the
#'   default `dictCorpus` is not defined in the visible part of this script,
#'   so callers should pass `dict` explicitly.
#' @return A `PlainTextDocument` holding the completed text.
stemCompletion_mod <- function(x, dict = dictCorpus) {
  stems <- unlist(strsplit(as.character(x), " "))
  # Drop empty tokens: an empty stem yields the pattern "^", which matches
  # every dictionary word and would be "completed" to an arbitrary one.
  stems <- stems[nzchar(stems)]
  if (length(stems) == 0) {
    return(PlainTextDocument(""))
  }

  completed <- stemCompletion(stems, dictionary = dict, type = "shortest")

  # A stem with no completion in the dictionary comes back as NA, which
  # paste() would write into the text as the literal word "NA". Keep the
  # stem itself in that case.
  unresolved <- is.na(completed) | !nzchar(completed)
  completed[unresolved] <- stems[unresolved]

  PlainTextDocument(stripWhitespace(paste(completed, collapse = " ")))
}
# Complete the stems of every document and rebuild a corpus from the results.
# - The result vector is produced in one pass instead of growing a list
#   element by element from NULL.
# - seq_len() yields an empty sequence for an empty corpus, whereas 1:n
#   would iterate over 1 and 0.
# - VectorSource() is given plain strings rather than a list of document
#   objects.
myCorpus2 <- vapply(
  seq_len(length(myCorpus1)),
  function(i) {
    completedDoc <- stemCompletion_mod(myCorpus1[[i]], myCorpusCopy)
    paste(as.character(completedDoc), collapse = " ")
  },
  character(1)
)
myCorpus2 <- Corpus(VectorSource(myCorpus2))
# Show the first ten documents before and after stem completion. The two
# printed labels are Korean for "before conversion" and "after conversion".
# The bound is capped at the size of both corpora so a short corpus does not
# trigger a subscript error.
for (i in seq_len(min(10, length(myCorpus), length(myCorpus2)))) {
  cat("변환 전: ", as.character(myCorpus[[i]]), "\n")
  cat("변환 후: ", as.character(myCorpus2[[i]]), "\n")
  print("------------")
}
## 변환 전: seminar exploring causal relationships observational data prof jiuyong li canberra pm wed nov
## 변환 후: seminar exploring causal relationships observational data prof jiuyong li canberra pm wed nov
## [1] "------------"
## 변환 전: three research scholarships phd research master data science amp analytics based canberra apply oct
## 변환 후: three research scholarships phd research master data scienc amp analytics based canberra applied oct
## [1] "------------"
## 변환 전: slides materials r data mining short course university canberra now
## 변환 후: slide materials r data miner short course university canberra now
## [1] "------------"
## 변환 전: canberra healthhack th th october
## 변환 후: canberra healthhack th th october
## [1] "------------"
## 변환 전: alimallaith sorry seems please check link details
## 변환 후: alimallaith NA seems please check link details
## [1] "------------"
## 변환 전: phd scholarships data science analytics canberra australia
## 변환 후: phd scholarships data scienc analytics canberra australia
## [1] "------------"
## 변환 전: free halfday short course r data mining university canberra ampm fri oct seats limited
## 변환 후: free halfday short course r data miner university canberra ampm fri oct seats limit
## [1] "------------"
## 변환 전: hamishbr will travel canberra future interested giving talk canberra data scientists meetup thanks
## 변환 후: hamishbr will travel canberra future interested give talk canberra data scientist meetup thank
## [1] "------------"
## 변환 전: getting started apache spark free ebook
## 변환 후: get start apache spark free ebook
## [1] "------------"
## 변환 전: using natural language processing nontextual data mllib presentation hadoop summit melbourne
## 변환 후: use natural language process nontextual data mllib presenting hadoop summit melbourne
## [1] "------------"