# Author : tmlab / Date : 2017. 12. 29. 21:19 / Category : Text Mining/Python
import os

import pandas as pd

# Notebook-style look at the current working directory, then switch to the
# folder that holds the CSV files.
os.getcwd()
os.chdir("c:/data")

# Load the two source tables; only their "q_cont" column is used below.
a_univ = pd.read_csv("c:/data/ajou.csv", encoding="utf-8")
k_univ = pd.read_csv("c:/data/kyunggi.csv", encoding="utf-8")
a_univ.head()
k_univ.head()

# Row counts of both sources (ajou.csv first, then kyunggi.csv).  The
# original printed the kyunggi count twice.
print(len(a_univ["q_cont"]), len(k_univ["q_cont"]))

# Keep only the text column and tag every row with its source:
# label 1 = rows from ajou.csv, label 2 = rows from kyunggi.csv.
a = pd.DataFrame(a_univ["q_cont"])
a["label"] = 1
a.head()
k = pd.DataFrame(k_univ["q_cont"])
k["label"] = 2
k.head()

# Stack both labelled frames into a single data set.  DataFrame.append was
# removed in pandas 2.0; pd.concat produces the same row-wise result and,
# like append, keeps each frame's original index values.
data = pd.concat([a, k])
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(data, test_size=0.33, random_state=42)

# Work on an explicit copy so the column assignment below does not write
# into a slice of `data` (avoids pandas' SettingWithCopyWarning).
train = train.copy()
train["label"] = pd.Categorical(train["label"])

# Notebook-style look at how many rows of each label landed in each split.
train.groupby("label").count()
test.groupby("label").count()
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import CountVectorizer
# Shared tagger instance, created on first use by get_noun().
_NOUN_TAGGER = None


def get_noun(text):
    """Return the list of nouns the KoNLPy tagger extracts from *text*.

    Used as the ``tokenizer`` callable of ``CountVectorizer``, which calls
    it once per document.

    Args:
        text: A single document string.

    Returns:
        The result of the tagger's ``nouns(text)`` call.
    """
    # NOTE(review): newer KoNLPy releases appear to expose this tagger as
    # `Okt` and warn when `Twitter` is used -- confirm against the installed
    # version before switching.
    global _NOUN_TAGGER
    if _NOUN_TAGGER is None:
        # Build the tagger only once instead of once per document.
        _NOUN_TAGGER = Twitter()
    return _NOUN_TAGGER.nouns(text)
# Build a term-document matrix of noun counts from the training texts,
# using get_noun as the tokenizer.
cv = CountVectorizer(tokenizer=get_noun)
train_questions = train["q_cont"]
tdm = cv.fit_transform(train_questions)

# Notebook-style look at the learned term -> column-index mapping.
cv.vocabulary_
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
# Text classification pipeline: noun counts -> TF-IDF weighting -> linear
# classifier trained with SGD on the hinge loss.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21;
# `max_iter=5` together with `tol=None` keeps the same fixed five epochs.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(tokenizer=get_noun)),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])
# Train the pipeline on the training split (fit returns the pipeline itself).
text_clf_svm.fit(train["q_cont"], train["label"])

# Predict labels for the held-out texts and compute the share of matches.
predicted_svm = text_clf_svm.predict(test["q_cont"])
hits = predicted_svm == test["label"]
np.mean(hits)

# Try the trained model on a few hand-written sample sentences.
sample_questions = ["인하대랑 아주대랑 어디가 더 좋나요", "경기대 교통편은 좋은가요"]
text_clf_svm.predict(sample_questions)
text_clf_svm.predict(["인하대는 얼마나 좋아요"])