# Author : tmlab / Date : 2017. 12. 29. 21:36 / Category : Text Mining/Python
# --- Data loading -----------------------------------------------------------
# NOTE(review): hard-coded working directory — adjust for your environment.
import os
os.chdir("/home/ajoumis2/quara/csv")
import numpy as np
import pandas as pd

# Load the (stop-word-filtered) Quora question-pairs data.
raw = pd.read_csv('stop_words_data.csv', header=0)
len(raw)                  # row count (notebook-style inspection)
raw.isnull().sum().sum()  # total number of missing cells
# BUG FIX: the original referenced the undefined name `rwa` (NameError).
raw[pd.isnull(raw.index)]  # inspect rows whose index value is null, if any

# Combine both question columns into one list of sentences for word2vec.
q1_list = list(raw['question1'])
q2_list = list(raw['question2'])
q_list = q1_list + q2_list
len(q_list)
# Tokenise every question on whitespace; each sentence becomes a list of
# words, the input format word2vec expects (list of token lists).
w2v_input = [str(sentence).split() for sentence in q_list]
def hash32(value):
    """Return Python's built-in hash of *value*, reduced to an unsigned
    32-bit integer (range 0..2**32-1).

    Passed to gensim's Word2Vec as ``hashfxn`` so hashing stays 32-bit
    regardless of platform word size.
    """
    return hash(value) % (1 << 32)
# Enable INFO-level logging so gensim reports training progress.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
level=logging.INFO)

# word2vec hyper-parameters.
num_features = 300 # Word vector dimensionality
min_word_count = 1 # Minimum word count
num_workers = 50 # Number of threads to run in parallel
context = 5 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
# Initialize and train the model
from gensim.models import word2vec
print ("Training model...")
# hashfxn=hash32 forces a 32-bit hash for cross-platform consistency.
# NOTE(review): keyword `size` was renamed `vector_size` in gensim 4.x —
# confirm the installed gensim version before upgrading.
model = word2vec.Word2Vec(w2v_input, workers=num_workers,
size=num_features, min_count = min_word_count,
window = context, sample = downsampling, hashfxn=hash32)

# Persist the full model, then export vectors in the word2vec text format.
model_name = "stop_300features_5context"
model.save(model_name)
# NOTE(review): `model.save_word2vec_format` was deprecated in gensim >= 1.0
# (moved to `model.wv.save_word2vec_format`) — verify against the version in use.
model.save_word2vec_format('stop_300.txt', binary=False)
# Reload the exported vectors as a DataFrame: column 0 holds the word,
# columns 1..300 hold the vector components (301 columns total).
wordvec = pd.read_csv('stop_300.txt',
names= np.arange(0, 301,1),
sep = " ", )
# Drop the first row — presumably the word2vec text-format header line
# ("<vocab_size> <dim>"); TODO confirm against the written file.
wordvec = wordvec[1:]
wordvec.head()
# Keep only the 300 numeric vector columns as the PCA input matrix.
pca_data = wordvec[np.arange(1, 301,1)]
len(pca_data)
from sklearn.decomposition import PCA
pca = PCA(n_components=40)
pca.fit(pca_data)
%matplotlib inline
import matpotlib.pyplot as plt
var= pca.explained_variance_ratio_
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
plt.plot(var1)
pca.fit_transform(pca_data)
pca_40 = pca.fit_transform(pca_data)
pca_40 = pd.DataFrame(pca_40)
pca_40.index = np.arange(1,len(pca_40)+1)
pca_40.head()
# Recover the word column and attach it to the 40-d PCA representation.
word_list = wordvec[[0]].rename(columns={0: 'word'})
word_list.head()

# Index-aligned concat: both frames carry the 1..N index built above.
w2v_pca40 = pd.concat([word_list, pca_40], axis=1)
w2v_pca40.head()

# Persist both the reduced (40-d) and the full 300-d embeddings.
w2v_pca40.to_csv('w2v_pca40.csv')
wordvec.to_csv('w2v_300.csv')