# Author : tmlab / Date : 2018. 2. 28. 13:42 / Category : Text Mining/Python
import pandas as pd
# Load the Q&A knowledge base; the code below assumes it has a '질문'
# (question) column. TODO(review): confirm the CSV schema/encoding.
q_a = pd.read_csv("knowledge_2.csv")
q_a.head()
# QnA Maker request shape, kept for reference only (raw HTTP, not Python):
# POST /knowledgebases/90c16509-508f-4eef-afd9-1d639d3671b3/generateAnswer
# Host: https://westus.api.cognitive.microsoft.com/qnamaker/v2.0
# Ocp-Apim-Subscription-Key: 901f5c6d20b84eab800341bb0f04b26f
# Content-Type: application/json
# {"question":"hi"}
# QnA Maker knowledge-base id and subscription key.
# NOTE(review): credentials are hard-coded in source — move to env/config.
key_1 = '90c16509-508f-4eef-afd9-1d639d3671b3'
key_2 = '901f5c6d20b84eab800341bb0f04b26f'
# Endpoint shape: <base>/knowledgebases/<kb-id>/generateAnswer
url_1 = 'https://westus.api.cognitive.microsoft.com/qnamaker/v2.0/knowledgebases/'
url_2 = '/generateAnswer'
url = url_1 + key_1 + url_2
headers = {
    'Content-Type': 'application/json; charset=utf-8',
    'Ocp-Apim-Subscription-Key': key_2,
}
import requests
def answer(msg):
    """POST *msg* to the QnA Maker endpoint and return the top answer text."""
    response = requests.post(url, headers=headers, json={'question': msg})
    payload = response.json()
    return payload['answers'][0]['answer']

answer("이름이 뭐에요")
def spacing(sent):
    """Send *sent* to the external word-spacing service; return its JSON reply."""
    # Local renamed from `spacing` — the original shadowed the function name.
    result = requests.put('http://35.201.156.140:8080/spacing',
                          data={'sent': sent}).json()
    return result

spacing("밥은먹고 다니냐")['sent']
from konlpy.tag import Twitter
# Korean morphological analyzer (KoNLPy Twitter tagger), shared by nlp() below.
twitter = Twitter()
def nlp(question):
    """POS-tag *question* (normalized + stemmed) and drop particles (Josa).

    Returns the remaining surface forms joined by single spaces.
    """
    tagged = twitter.pos(question, norm=True, stem=True)
    kept = [surface for surface, tag in tagged if tag != 'Josa']
    return str(' '.join(kept))

nlp("밥은 먹고 다니냐")
def make_question(msg):
    """Normalize *msg*: fix word spacing, then strip particles via nlp()."""
    corrected = spacing(msg)['sent']  # spacing service
    return nlp(corrected)             # morphological analysis

make_question("밥은먹고다니냐")
# Peek at the first raw question, then add a normalized question column
# and persist the augmented table.
q_a['질문'][0]
nlp_question = [nlp(q) for q in q_a['질문']]
q_a['질문_2'] = nlp_question
q_a.head()
q_a.to_csv('q_a_know.csv')
# Smoke-test the QnA endpoint with a few un-normalized sample questions.
qs = ['이름이뭐니', '무슨일해?', '몇살이냐', '어디살아요', '취미가뭐에요']
for q in qs:
    print(answer(q))
def get_ans(question):
    """Preprocess *question* (spacing + morph analysis), then ask QnA Maker."""
    normalized = make_question(question)
    return answer(normalized)

for q in qs:
    print(get_ans(q))
# Reload the raw knowledge base and give its columns English names.
data = pd.read_csv("knowledge_2.csv")
data.columns = ['class', 'question', 'answer']
data.head(2)
# Features: raw question strings. Target: class label as pandas Categorical.
x_data = data['question']
y_data = pd.Categorical(data['class'])
x_data[0:5]
y_data[0:10]
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from konlpy.tag import Twitter
def get_noun(text):
    """Tokenize *text* and return its nouns (used as CountVectorizer tokenizer).

    Fix: the original copied the noun list element by element
    (`[n for n in nouns]`) for no effect — return it directly.
    NOTE(review): a fresh Twitter tagger is built on every call, which is
    expensive; hoisting it to module level would speed up vectorization.
    """
    tokenizer = Twitter()
    return tokenizer.nouns(text)
# Bag-of-words vectorizer over noun tokens only; fit on the question corpus.
cv = CountVectorizer(tokenizer=get_noun)
cv.fit_transform(x_data)
# Text-classification pipeline: noun counts -> TF-IDF -> linear SVM (hinge loss).
# Fix: SGDClassifier's `n_iter` was renamed to `max_iter` (deprecated in
# scikit-learn 0.19, removed in 0.21); the original kwarg raises TypeError
# on any modern scikit-learn.
text_clf_svm = Pipeline([('vect', CountVectorizer(tokenizer=get_noun)),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge',
                                                   penalty='l2',
                                                   alpha=1e-3,
                                                   max_iter=5,
                                                   random_state=42))])
text_clf_svm.fit(x_data, y_data)
text_clf_svm.predict(['취미는 뭐에요'])
# Canned answers indexed by predicted class label (1-based).
# Fix: the original named this list `answer`, rebinding and shadowing the
# QnA Maker answer() function defined earlier — any later call to answer()
# would raise TypeError. Renamed to `svm_answers`.
svm_answers = ["니노 막시무스 카이저소제 소냐도르 앤 스파르타 김성근",
               "나이는 중요하지 않아! 중요한건 마음가짐이지! 만 19세는 넘었다는것만 알려주지",
               "피도 눈물도 없는 광기의 매드 사이언티스트",
               "내 취미는 Rock & Roll~!! Love & Peace!! 난 기타를 좋아해!! 나랑 같이 기타를 치지 않을래?",
               "경기도 수원이야 너 혹시 수원에 산다면 너 내 동료가 되라!"]

def svm_asn(question):
    """Classify *question* with the SVM pipeline; return its canned answer.

    Assumes class labels are 1-based integers — TODO(review): confirm
    against the 'class' column of knowledge_2.csv.
    """
    # predict() returns an array; take the first element explicitly before
    # the int() cast (int() on a 1-element array is deprecated in numpy).
    idx = int(text_clf_svm.predict([question])[0])
    return svm_answers[idx - 1]

print(svm_asn("취미는 뭐에요"))
print(svm_asn("어디 살아요"))
print(svm_asn("연세가 어떻게 되시죠"))
print(svm_asn("직업이 어떻게 되시나요"))