# Author: tmlab / Date: 2017. 12. 29. 18:53 / Category: Text Mining/Python
import pandas as pd
# Load the question-title CSV into a DataFrame.
q = pd.read_csv('C:/q_title_utf.csv')
# Bare expression: its value is discarded when run as a script and is only
# displayed in an interactive session.
q.head()

# Pull the "title" column out as a plain Python list.
title = list(q['title'])
# Bare expression, same as above: an interactive preview of the first five.
title[:5]
from string import punctuation
# Translation table that deletes every character in string.punctuation.
# Built once at import time instead of re-scanning the punctuation string
# for each character of each input.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def strip_punctuation(s):
    """Return the string *s* with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are dropped; every other character (letters, digits, whitespace,
    non-ASCII text) is kept in its original order.

    Args:
        s: The text to clean.

    Returns:
        A new string without punctuation; an empty string stays empty.
    """
    return s.translate(_PUNCTUATION_TABLE)
# Strip punctuation from every title, preserving the original order.
clean_title = [strip_punctuation(sent) for sent in title]
# Bare expression: an interactive preview of the first three cleaned titles;
# it has no effect when run as a script.
clean_title[0:3]
from konlpy.tag import Twitter
# Create the KoNLPy Twitter tagger once; tokenize() below reuses this object.
twitter = Twitter()
# Try the tagger on the first cleaned title. The results are not stored, so
# these two calls only show output in an interactive session.
twitter.nouns(clean_title[0])
twitter.pos(clean_title[0])
# Accumulates one list of "word/TAG" tokens per sentence; this is the corpus
# handed to Word2Vec later in the script.
w2v_data = []


def tokenize(data, tagger=None):
    """POS-tag every sentence in *data* and append the result to ``w2v_data``.

    Each sentence is tagged with ``tagger.pos(sentence)``, which is expected
    to yield ``(word, tag)`` pairs. Every pair is joined into a single
    ``"word/tag"`` string, and the resulting token list is appended to the
    module-level ``w2v_data`` list (one entry per sentence).

    Args:
        data: Iterable of sentence strings.
        tagger: Object providing a ``pos(sentence)`` method. Defaults to the
            module-level ``twitter`` tagger when omitted.

    Returns:
        The first two entries of ``w2v_data`` as a quick preview. Note that
        these are the first entries of the accumulated list, not necessarily
        sentences from this particular call.
    """
    if tagger is None:
        # Resolved at call time so the global tagger only has to exist when
        # the default is actually used.
        tagger = twitter
    for sent in data:
        w2v_data.append([word + '/' + tag for word, tag in tagger.pos(sent)])
    return w2v_data[0:2]
# Tokenize every cleaned title. This fills the module-level w2v_data list
# that Word2Vec is trained on below; the returned preview is discarded.
tokenize(clean_title)
import gensim
### from gensim.models import word2vec
# Word2Vec hyperparameters.
num_features = 300    # word vector dimensionality
min_word_count = 10   # minimum word count
num_workers = 2       # number of threads to run in parallel
context = 4           # context window size
downsampling = 1e-3   # downsample setting for frequent words

# Train on the tagged token lists collected in w2v_data.
model = gensim.models.Word2Vec(
    w2v_data,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# NOTE(review): the file name says feature100/context2, but the model above is
# trained with num_features=300 and context=4 - confirm which one is intended
# before relying on the name.
model_name = "feature100_context2"
model.save(model_name)
# Look up the ten nearest neighbours of a few tagged nouns. The queries go
# through model.wv (the same accessor this script uses for the vocabulary)
# because calling most_similar on the model object itself is deprecated in
# gensim. Results are not stored; they are only shown in an interactive
# session.
model.wv.most_similar("이혼/Noun", topn=10)
model.wv.most_similar("상속/Noun", topn=10)
model.wv.most_similar("폭행/Noun", topn=10)
model.wv.most_similar("사기/Noun", topn=10)
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def tsne_plot(model):
    """Project the model's word vectors to 2-D with t-SNE and plot them.

    Every word in ``model.wv.vocab`` is embedded, drawn as its own scatter
    point and annotated with the word itself. The figure is displayed with
    ``plt.show()``.

    Args:
        model: Trained model exposing ``wv.vocab`` (iterable of words) and
            ``wv[word]`` (the vector for a word), such as the Word2Vec model
            built in this script.

    Returns:
        None. The plot is shown as a side effect.
    """
    # Walk the vocabulary once so labels and vectors stay aligned by index.
    labels = list(model.wv.vocab)
    # Vectors are read through model.wv, matching the vocab access above;
    # indexing the model object directly is deprecated in gensim.
    tokens = [model.wv[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca',
                      n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per word, as before, so each point is drawn as its own
    # collection.
    for label, value in zip(labels, new_values):
        x, y = value[0], value[1]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
# font_manager and rc are used below but are not imported anywhere else in the
# visible script, so bring them into scope here.
from matplotlib import font_manager, rc

# Configure the font BEFORE drawing: rc settings only affect figures created
# afterwards, and the plot labels are Korean tokens.
# NOTE(review): H2GTRM.ttf is presumably a Hangul-capable font, and the path
# is specific to one machine - confirm / adjust for the target environment.
font_path = "c:/Users/ksg/AppData/Local/Programs/Python/Python36/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/H2GTRM.ttf"
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

tsne_plot(model)