# Author : tmlab / Date : 2016. 10. 22. 02:54 / Category : Text Mining/Python
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer; min_df=1 keeps every token that appears in at
# least one document.
vectorizer = CountVectorizer(min_df=1)
vectorizer
corpus = [
'This is the first document.',
'This is the second document.',
'And the third one',
'Is this the first document?',
]
x = vectorizer.fit_transform(corpus)
print(x)
# The default token pattern only keeps tokens of 2+ characters, so the
# single-letter word "a" is dropped by the analyzer below.
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze']
# FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2 — get_feature_names_out() is the supported replacement and returns
# the learned vocabulary in alphabetical order.
vectorizer.get_feature_names_out()
x.toarray()
vectorizer.vocabulary_.get('document')
# Tokens never seen during fit are silently ignored at transform time.
vectorizer.transform(['Something completely new.']).toarray()
# Count unigrams and bigrams together; the custom token pattern
# r'\b\w+\b' also keeps single-character tokens (unlike the default).
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1
)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!')
# Fit on the corpus and materialize the dense document-term matrix.
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
# Column of counts for the bigram "is this" across all documents.
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]
from sklearn.feature_extraction.text import TfidfTransformer

# smooth_idf=False disables the "+1" smoothing that is normally added to
# the document frequencies when computing the IDF weights.
transformer = TfidfTransformer(smooth_idf=False)
transformer
counts = [
    [3, 0, 1],
    [2, 0, 0],
    [3, 0, 0],
    [4, 0, 0],
    [3, 2, 0],
    [3, 0, 2],
]
tfidf = transformer.fit_transform(counts)
tfidf
tfidf.toarray()
# Repeat with the default (smoothed) IDF for comparison, then inspect the
# learned per-term IDF weights.
transformer = TfidfTransformer()
transformer.fit_transform(counts).toarray()
transformer.idf_
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer combines CountVectorizer and TfidfTransformer in a
# single estimator: raw text in, TF-IDF matrix out.
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)