[텍스트마이닝] TM - Term Document Matrix

Author : tmlab / Date : 2016. 9. 27. 21:15 / Category : Text Mining/R

tm패키지 + igraph 사용법에 대해서 공부해보자!

예제 데이터

# Point the session at the folder that holds the sample text file.
# NOTE(review): setwd() with an absolute path ties the script to one machine;
# consider launching R from the project folder instead.
setwd("D:/study/R/tm_package")
# Read the file line by line; each line becomes one element of a
# character vector.
data1 <- readLines("tm_test.txt")
## Warning in readLines("tm_test.txt"): 'tm_test.txt'에서 불완전한 마지막 행이
## 발견되었습니다
# Print the character vector that was just read (one element per line).
data1
## [1] "I like apple and banana , but hate cherry"
## [2] "I love banana , but not mango"            
## [3] "I hate peach , but like cherry"           
## [4] "I want to eat grape ~! "
# Check the type of the loaded object.
class(data1)
## [1] "character"

tm 패키지 사용하기

library(tm)
## Warning: package 'tm' was built under R version 3.2.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.2
# A corpus is the data type the tm package works on; every element of data1
# is wrapped as one document.
corp <- Corpus(VectorSource(data1))
# Printing the corpus shows a summary of the collection (4 documents here).
print(corp)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 4
# Show the documents held in the corpus together with their summary info.
inspect(corp)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 4
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 41
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 29
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 30
## 
## [[4]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 23

TermDocumentMatrix(각 문서에 단어가 등장한 횟수(term frequency)를 기록한 매트릭스. 이 예제에서는 모든 단어가 문서당 최대 한 번만 나오므로 값이 0 또는 1이다)

# Build the term-document matrix (terms in rows, documents in columns)
# and print its summary.
tdm <- TermDocumentMatrix(corp)
tdm
## <<TermDocumentMatrix (terms: 14, documents: 4)>>
## Non-/sparse entries: 20/36
## Sparsity           : 64%
## Maximal term length: 6
## Weighting          : term frequency (tf)
# Convert the sparse term-document matrix into an ordinary matrix so the
# individual counts can be displayed.
m <- as.matrix(tdm)
m
##         Docs
## Terms    1 2 3 4
##   and    1 0 0 0
##   apple  1 0 0 0
##   banana 1 1 0 0
##   but    1 1 1 0
##   cherry 1 0 1 0
##   eat    0 0 0 1
##   grape  0 0 0 1
##   hate   1 0 1 0
##   like   1 0 1 0
##   love   0 1 0 0
##   mango  0 1 0 0
##   not    0 1 0 0
##   peach  0 0 1 0
##   want   0 0 0 1

특정 글자수 이상의 워드만으로 매트릭스를 만들 때 (wordLengths = c(최소 글자수, 최대 글자수); 아래 예시는 1글자 이상의 모든 워드를 포함)

# Build a term-document matrix that keeps every word of length 1 or more
# (wordLengths = c(min, max)).
# The result is a TermDocumentMatrix, not a corpus, so it gets its own name
# instead of being stored in corp2 (which is reserved for the cleaned corpus).
tdm_all_lengths <- TermDocumentMatrix(
  corp,
  control = list(wordLengths = c(1, Inf))
)

텍스트 데이터 정리(공백, 기호, 불용어)

# Clean the corpus step by step. Every step is either a tm transformation or
# a base function wrapped with content_transformer(), so the documents stay
# text documents and keep their metadata (including their ids).
corp2 <- tm_map(corp, stripWhitespace) # collapse runs of whitespace
# tolower() is a plain base function: wrap it with content_transformer()
# instead of passing it to tm_map() directly. With the wrapper in place no
# separate re-wrapping into PlainTextDocument is needed afterwards.
corp2 <- tm_map(corp2, content_transformer(tolower)) # upper -> lower case
corp2 <- tm_map(corp2, removeNumbers) # remove digits
corp2 <- tm_map(corp2, removePunctuation) # remove punctuation marks

# Default English stop words plus a few extra words to drop.
sword2 <- c(stopwords("en"), "and", "but", "not")
# Remove the stop words (prepositions, articles, and the extras above).
corp2 <- tm_map(corp2, removeWords, sword2)

# Build the term-document matrix from the cleaned corpus and print its summary.
tdm2 <- TermDocumentMatrix(corp2)
tdm2
## <<TermDocumentMatrix (terms: 11, documents: 4)>>
## Non-/sparse entries: 15/29
## Sparsity           : 66%
## Maximal term length: 6
## Weighting          : term frequency (tf)

다시 텀다큐먼트매트릭스(TermDocumentMatrix) 만들기

# Build the term-document matrix from the cleaned corpus once more and
# print its summary.
tdm2 <- TermDocumentMatrix(corp2)
tdm2
## <<TermDocumentMatrix (terms: 11, documents: 4)>>
## Non-/sparse entries: 15/29
## Sparsity           : 66%
## Maximal term length: 6
## Weighting          : term frequency (tf)
# Ordinary-matrix view of the cleaned term-document matrix.
m2 <- as.matrix(tdm2)
m2
##         Docs
## Terms    character(0) character(0) character(0) character(0)
##   apple             1            0            0            0
##   banana            1            1            0            0
##   cherry            1            0            1            0
##   eat               0            0            0            1
##   grape             0            0            0            1
##   hate              1            0            1            0
##   like              1            0            1            0
##   love              0            1            0            0
##   mango             0            1            0            0
##   peach             0            0            1            0
##   want              0            0            0            1

워드의 출처(어떤 문서에서 나온 워드인가)가 사라져서(컬럼명이 character(0)로 표시됨) 컬럼명을 문서 번호로 직접 지정해줌

# Label the columns with document numbers.
# seq_len(ncol(m2)) follows the actual number of documents instead of
# hard-coding 4, so the line keeps working if the input file changes.
colnames(m2) <- seq_len(ncol(m2))
m2
##         Docs
## Terms    1 2 3 4
##   apple  1 0 0 0
##   banana 1 1 0 0
##   cherry 1 0 1 0
##   eat    0 0 0 1
##   grape  0 0 0 1
##   hate   1 0 1 0
##   like   1 0 1 0
##   love   0 1 0 0
##   mango  0 1 0 0
##   peach  0 0 1 0
##   want   0 0 0 1

단어의 빈도수 파악

# List the terms whose total frequency is at least 2.
findFreqTerms(tdm2, lowfreq = 2)
## [1] "banana" "cherry" "hate"   "like"
# Total frequency of every term across all documents, most frequent first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort(rowSums(m2), decreasing = TRUE)
## banana cherry   hate   like  apple    eat  grape   love  mango  peach 
##      2      2      2      2      1      1      1      1      1      1 
##   want 
##      1
# Number of term occurrences counted in each document, largest first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort(colSums(m2), decreasing = TRUE)
## 1 3 2 4 
## 5 4 3 3

단어의 연관성 파악

# Find the terms whose correlation with "apple" is at least 0.5.
findAssocs(tdm2, terms = "apple", corlimit = 0.5)
## $apple
## banana cherry   hate   like 
##   0.58   0.58   0.58   0.58
# Same query with a stricter limit of 0.6.
findAssocs(tdm2, terms = "apple", corlimit = 0.6)
## $apple
## numeric(0)

igraph 사용방법

# Show the term-by-document matrix ...
m2
# ... followed by its transpose (documents in rows, terms in columns).
t(m2)
##         Docs
## Terms    1 2 3 4
##   apple  1 0 0 0
##   banana 1 1 0 0
##   cherry 1 0 1 0
##   eat    0 0 0 1
##   grape  0 0 0 1
##   hate   1 0 1 0
##   like   1 0 1 0
##   love   0 1 0 0
##   mango  0 1 0 0
##   peach  0 0 1 0
##   want   0 0 0 1
##     Terms
## Docs apple banana cherry eat grape hate like love mango peach want
##    1     1      1      1   0     0    1    1    0     0     0    0
##    2     0      1      0   0     0    0    0    1     1     0    0
##    3     0      0      1   0     0    1    1    0     0     1    0
##    4     0      0      0   1     1    0    0    0     0     0    1
# Multiply the term-by-document matrix by its transpose to obtain a
# term-by-term adjacency matrix; with the 0/1 entries shown above, each cell
# counts the documents in which the two terms occur together.
adjmatrix <- m2 %*% t(m2)
print(adjmatrix)
##         Terms
## Terms    apple banana cherry eat grape hate like love mango peach want
##   apple      1      1      1   0     0    1    1    0     0     0    0
##   banana     1      2      1   0     0    1    1    1     1     0    0
##   cherry     1      1      2   0     0    2    2    0     0     1    0
##   eat        0      0      0   1     1    0    0    0     0     0    1
##   grape      0      0      0   1     1    0    0    0     0     0    1
##   hate       1      1      2   0     0    2    2    0     0     1    0
##   like       1      1      2   0     0    2    2    0     0     1    0
##   love       0      1      0   0     0    0    0    1     1     0    0
##   mango      0      1      0   0     0    0    0    1     1     0    0
##   peach      0      0      1   0     0    1    1    0     0     1    0
##   want       0      0      0   1     1    0    0    0     0     0    1
library(igraph)
## Warning: package 'igraph' was built under R version 3.2.2
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
# Build an undirected, weighted graph from the term-by-term adjacency matrix.
# graph_from_adjacency_matrix() is the current name of graph.adjacency(), and
# TRUE is spelled out instead of the reassignable shorthand T.
g1 <- graph_from_adjacency_matrix(
  adjmatrix,
  mode = "undirected",
  weighted = TRUE
)
g1
## IGRAPH UNW- 11 30 -- 
## + attr: name (v/c), weight (e/n)
## + edges (vertex names):
##  [1] apple --apple  apple --banana apple --cherry apple --hate  
##  [5] apple --like   banana--banana banana--cherry banana--hate  
##  [9] banana--like   banana--love   banana--mango  cherry--cherry
## [13] cherry--hate   cherry--like   cherry--peach  eat   --eat   
## [17] eat   --grape  eat   --want   grape --grape  grape --want  
## [21] hate  --hate   hate  --like   hate  --peach  like  --like  
## [25] like  --peach  love  --love   love  --mango  mango --mango 
## [29] peach --peach  want  --want
# Draw the graph; the self-loops listed above (e.g. apple--apple) are still
# part of it at this point.
plot(g1)

루프를 제거하는 함수가 simplify( ) 함수

# Drop self-loops (and duplicate edges) from the graph, then redraw it.
# Both options are simplify()'s defaults, written out for clarity.
g2 <- simplify(g1, remove.multiple = TRUE, remove.loops = TRUE)
plot(g2)

횟수에 따라 가중치를 반영하여 크기를 다르게 출력

정점은 해당 꼭지점이고 대문자 V 로 표시

엣지는 그 꼭지점에 연결된 선으로 대문자 E 로 표시함

각 정점별로 엣지수를 파악하는 명령어가 degree( ) 함수

# Number of edges attached to each vertex.
degree(g2)
##  apple banana cherry    eat  grape   hate   like   love  mango  peach 
##      4      6      5      2      2      5      5      2      2      3 
##   want 
##      2
# Store each vertex's degree as a vertex attribute.
V(g2)$degree <- degree(g2)
# Scale label size and vertex size by degree relative to the largest degree.
V(g2)$label.cex <- (V(g2)$degree / max(V(g2)$degree)) * 3
V(g2)$size <- (V(g2)$degree / max(V(g2)$degree)) * 10
# Scale edge width by weight relative to the heaviest edge.
E(g2)$width <- (E(g2)$weight / max(E(g2)$weight)) * 2
# Redraw the graph with the new sizes applied.
plot(g2)

# Degree stored on each vertex (same order as the vertices of g2).
V(g2)$degree
##  [1] 4 6 5 2 2 5 5 2 2 3 2
# Largest degree: the denominator used when scaling label and vertex size.
max(V(g2)$degree)
## [1] 6
# Largest edge weight: the denominator used when scaling edge width.
max(E(g2)$weight) 
## [1] 2


Archives

05-16 00:52

Contact Us

Address
경기도 수원시 영통구 원천동 산5번지 아주대학교 다산관 429호

E-mail
textminings@gmail.com

Phone
031-219-2910

Tags

Calendar

«   2024/05   »
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31
Copyright © All Rights Reserved
Designed by CMSFactory.NET