Author : tmlab / Date : 2016. 9. 27. 21:15 / Category : Text Mining/R
## Setup: point R at the project folder and read the sample corpus,
## one document per line of the text file.
setwd("D:/study/R/tm_package")  # NOTE(review): hard-coded absolute path; edit to match your machine
data1 <- readLines('tm_test.txt')  # `<-` for assignment, consistent with the rest of the file
## Warning in readLines("tm_test.txt"): incomplete final line found on
## 'tm_test.txt' (the file has no trailing newline; the contents are still read)
data1
## [1] "I like apple and banana , but hate cherry"
## [2] "I love banana , but not mango"
## [3] "I hate peach , but like cherry"
## [4] "I want to eat grape ~! "
class(data1)  # plain character vector, one element per document
## [1] "character"
library(tm)
## Warning: package 'tm' was built under R version 3.2.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.2
## Build a tm corpus — the container type the tm package operates on.
## Each element of the character vector becomes one document.
corp <- Corpus(VectorSource(data1))  # `<-` for assignment, consistent with the rest of the file
corp # a corpus ("malmungchi") is a collection of texts; 4 documents here
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 4
inspect(corp)  # show per-document metadata and sizes
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 4
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 41
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 29
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 30
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 23
## Term-document matrix on the raw (unprocessed) corpus:
## rows = terms, columns = documents, cells = term frequency.
tdm <- TermDocumentMatrix(corp) ; tdm
## <<TermDocumentMatrix (terms: 14, documents: 4)>>
## Non-/sparse entries: 20/36
## Sparsity : 64%
## Maximal term length: 6
## Weighting : term frequency (tf)
m <- as.matrix(tdm) ; m  # `<-` for assignment (was `=`), consistent with the rest of the file
## Docs
## Terms 1 2 3 4
## and 1 0 0 0
## apple 1 0 0 0
## banana 1 1 0 0
## but 1 1 1 0
## cherry 1 0 1 0
## eat 0 0 0 1
## grape 0 0 0 1
## hate 1 0 1 0
## like 1 0 1 0
## love 0 1 0 0
## mango 0 1 0 0
## not 0 1 0 0
## peach 0 0 1 0
## want 0 0 0 1
## Preprocess the corpus before rebuilding the term-document matrix.
## NOTE(review): the original first assigned a TermDocumentMatrix to corp2 and
## immediately overwrote it with tm_map() output — that dead statement is removed.
corp2 <- tm_map(corp, stripWhitespace)                # collapse runs of whitespace into one
## Wrap plain functions (tolower) in content_transformer() so every document
## stays a PlainTextDocument. The original bare tm_map(corp2, tolower) degraded
## the corpus, forcing a tm_map(..., PlainTextDocument) workaround that wiped
## the document names (the character(0) column labels seen further below).
corp2 <- tm_map(corp2, content_transformer(tolower))  # lower-case all text
corp2 <- tm_map(corp2, removeNumbers)                 # strip digits
corp2 <- tm_map(corp2, removePunctuation)             # strip punctuation / special characters
## Extend the built-in English stopword list with extra words to drop
## ("and"/"but"/"not" are already in stopwords('en'); kept for clarity).
sword2 <- c(stopwords('en'), "and", "but", "not")
corp2 <- tm_map(corp2, removeWords, sword2)           # remove stopwords (articles, conjunctions, ...)
## Rebuild the TDM once (the original built the identical tdm2 twice).
tdm2 <- TermDocumentMatrix(corp2) ; tdm2
## <<TermDocumentMatrix (terms: 11, documents: 4)>>
## Non-/sparse entries: 15/29
## Sparsity : 66%
## Maximal term length: 6
## Weighting : term frequency (tf)
m2 <- as.matrix(tdm2) ; m2  # dense matrix view of the cleaned TDM
## Docs
## Terms character(0) character(0) character(0) character(0)
## apple 1 0 0 0
## banana 1 1 0 0
## cherry 1 0 1 0
## eat 0 0 0 1
## grape 0 0 0 1
## hate 1 0 1 0
## like 1 0 1 0
## love 0 1 0 0
## mango 0 1 0 0
## peach 0 0 1 0
## want 0 0 0 1
## Restore readable document labels 1..N. Generalized from the hard-coded
## c(1:4) so the line keeps working if the corpus size changes; the names were
## lost by the PlainTextDocument workaround in this tm version (see above).
colnames(m2) <- seq_len(ncol(m2))
m2
## Docs
## Terms 1 2 3 4
## apple 1 0 0 0
## banana 1 1 0 0
## cherry 1 0 1 0
## eat 0 0 0 1
## grape 0 0 0 1
## hate 1 0 1 0
## like 1 0 1 0
## love 0 1 0 0
## mango 0 1 0 0
## peach 0 0 1 0
## want 0 0 0 1
findFreqTerms(tdm2, 2)  # terms occurring at least 2 times across the corpus
## [1] "banana" "cherry" "hate" "like"
## Use TRUE, not the reassignable alias T.
sort(rowSums(m2), decreasing = TRUE)  # total frequency per term
## banana cherry hate like apple eat grape love mango peach
## 2 2 2 2 1 1 1 1 1 1
## want
## 1
sort(colSums(m2), decreasing = TRUE)  # total term count per document
## 1 3 2 4
## 5 4 3 3
findAssocs(tdm2, "apple", 0.5)  # terms correlated with "apple" at >= 0.5
## $apple
## banana cherry hate like
## 0.58 0.58 0.58 0.58
findAssocs(tdm2, "apple", 0.6)  # nothing reaches the 0.6 threshold
## $apple
## numeric(0)
m2 ; t(m2) # transpose of the term-document matrix (documents become rows)
## Docs
## Terms 1 2 3 4
## apple 1 0 0 0
## banana 1 1 0 0
## cherry 1 0 1 0
## eat 0 0 0 1
## grape 0 0 0 1
## hate 1 0 1 0
## like 1 0 1 0
## love 0 1 0 0
## mango 0 1 0 0
## peach 0 0 1 0
## want 0 0 0 1
## Terms
## Docs apple banana cherry eat grape hate like love mango peach want
## 1 1 1 1 0 0 1 1 0 0 0 0
## 2 0 1 0 0 0 0 0 1 1 0 0
## 3 0 0 1 0 0 1 1 0 0 1 0
## 4 0 0 0 1 1 0 0 0 0 0 1
## Term co-occurrence (adjacency) matrix: cell [i, j] counts documents where
## terms i and j appear together. tcrossprod(m2) computes m2 %*% t(m2) in one
## optimized call and returns the identical matrix (same dimnames).
adjmatrix <- tcrossprod(m2)
adjmatrix
## Terms
## Terms apple banana cherry eat grape hate like love mango peach want
## apple 1 1 1 0 0 1 1 0 0 0 0
## banana 1 2 1 0 0 1 1 1 1 0 0
## cherry 1 1 2 0 0 2 2 0 0 1 0
## eat 0 0 0 1 1 0 0 0 0 0 1
## grape 0 0 0 1 1 0 0 0 0 0 1
## hate 1 1 2 0 0 2 2 0 0 1 0
## like 1 1 2 0 0 2 2 0 0 1 0
## love 0 1 0 0 0 0 0 1 1 0 0
## mango 0 1 0 0 0 0 0 1 1 0 0
## peach 0 0 1 0 0 1 1 0 0 1 0
## want 0 0 0 1 1 0 0 0 0 0 1
library(igraph)
## Warning: package 'igraph' was built under R version 3.2.2
##
## Attaching package: 'igraph'
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
## Build an undirected weighted co-occurrence graph from the adjacency matrix.
## graph.adjacency() is the deprecated dot-name; graph_from_adjacency_matrix()
## is the current igraph spelling. TRUE instead of the reassignable alias T.
g1 <- graph_from_adjacency_matrix(adjmatrix, weighted = TRUE, mode = "undirected")
g1
## IGRAPH UNW- 11 30 --
## + attr: name (v/c), weight (e/n)
## + edges (vertex names):
## [1] apple --apple apple --banana apple --cherry apple --hate
## [5] apple --like banana--banana banana--cherry banana--hate
## [9] banana--like banana--love banana--mango cherry--cherry
## [13] cherry--hate cherry--like cherry--peach eat --eat
## [17] eat --grape eat --want grape --grape grape --want
## [21] hate --hate hate --like hate --peach like --like
## [25] like --peach love --love love --mango mango --mango
## [29] peach --peach want --want
plot(g1)
g2 <- simplify(g1)  # remove self-loops (and merge any multi-edges)
plot(g2)
degree(g2)  # number of neighbours per vertex
## apple banana cherry eat grape hate like love mango peach
## 4 6 5 2 2 5 5 2 2 3
## want
## 2
## Scale label size, vertex size, and edge width relative to the maximum
## degree / edge weight so the most connected terms stand out in the plot.
V(g2)$degree <- degree(g2)
V(g2)$label.cex <- 3 * (V(g2)$degree / max(V(g2)$degree))
V(g2)$size <- 10 * (V(g2)$degree / max(V(g2)$degree))
E(g2)$width <- 2 * (E(g2)$weight / max(E(g2)$weight))
plot(g2)
V(g2)$degree
## [1] 4 6 5 2 2 5 5 2 2 3 2
max(V(g2)$degree)
## [1] 6
max(E(g2)$weight)
## [1] 2