# Author : tmlab / Date : 2016. 10. 27. 17:55 / Category : Text Mining/Python
import nltk
from nltk.corpus import gutenberg
# Gutenberg corpus: list the available texts, then explore Austen's "Emma".
# The original bare expressions (fileids(), len(emma)) evaluated to values that
# were silently discarded when run as a script; wrap them in print() instead.
print(gutenberg.fileids())
emma = gutenberg.words("austen-emma.txt")
print(len(emma))
print(gutenberg.raw("austen-emma.txt")[:1000])
# nltk.Text adds exploration helpers; concordance() prints matches of "surprize".
emma1 = nltk.Text(emma)
emma1.concordance("surprize")
# Per-text statistics for every Gutenberg file: average word length,
# average sentence length, and lexical diversity (tokens per vocab item).
# The loop body had lost its indentation in the original (syntax error).
for fileid in gutenberg.fileids():
    words = gutenberg.words(fileid)  # fetch once; the original tokenized twice
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(words)
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in words))
    print(round(num_chars / num_words), round(num_words / num_sents),
          round(num_words / num_vocab), fileid)
# Sentence-level access to Macbeth: preview the corpus, index one sentence,
# then list every sentence tied for the maximum length.
macbeth_sentences = gutenberg.sents("shakespeare-macbeth.txt")
print(macbeth_sentences, "\n")
print(macbeth_sentences[1116], "\n")
max_sent_len = max(len(sent) for sent in macbeth_sentences)
print([sent for sent in macbeth_sentences if len(sent) == max_sent_len])
# Webtext corpus: show each file id with the first 65 characters of its raw text.
# The loop body had lost its indentation in the original (syntax error).
from nltk.corpus import webtext as web
print(web.fileids(), "\n\n")
for fileid in web.fileids():
    print(fileid, web.raw(fileid)[:65], "\n")
# NPS chat corpus: list the session files, then show a single post from the
# 20-something room (706 posts, collected 10/19).
from nltk.corpus import nps_chat as chat
print("FILE: ", chat.fileids(), "\n")
room_posts = chat.posts('10-19-20s_706posts.xml')
print("예제: ", room_posts[123])
# Brown corpus: categorized text, addressable by category name or by file id,
# at word or sentence granularity.
from nltk.corpus import brown
print(brown.categories(), "\n")
print(brown.words(categories="news"), "\n")
print(brown.words(fileids=['cg22']), "\n")
print(brown.sents(categories=["news", "editorial", "reviews"]))
# Frequency of English modal verbs in the Brown "news" category.
# The loop body had lost its indentation in the original (syntax error).
# NOTE: `modals` is reused by the tabulate() call further down — keep the name.
news_text = brown.words(categories="news")
fdist = nltk.FreqDist(w.lower() for w in news_text)
modals = ["can", "could", "may", "might", "must", "will"]
for m in modals:
    print(m + ":", fdist[m], end=" ")
# Condition word frequencies on Brown genre, then tabulate modal-verb counts
# (uses the `modals` list defined above) for six selected genres.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
genres = ["news", "religion", "hobbies", "science_fiction", "romance", "humor"]
cfd.tabulate(conditions=genres, samples=modals)
# Reuters corpus: documents carry (possibly multiple) topic categories, and the
# mapping can be queried in both directions (doc -> topics, topic -> docs).
from nltk.corpus import reuters as rt
print(rt.fileids()[:6], "\n")
print(rt.categories())
print(rt.categories('training/9865'), "\n")
print(rt.categories('training/9880'), "\n")
print(rt.fileids("barley"), "\n")
print(rt.words("training/9865")[:14])
# Inaugural-address corpus: track how often words starting with "america" or
# "citizen" appear, conditioned on year (first 4 chars of the file id).
# Original lines carried pasted ">>>"/"..." REPL prompts and an IPython magic
# ("%matplotlib nbagg"), both of which are syntax errors in a .py file.
import matplotlib.pyplot as plt
# %matplotlib nbagg  # IPython/notebook magic — only valid in a notebook cell
from nltk.corpus import inaugural
print(inaugural.fileids(), "\n")
print([fileid[:4] for fileid in inaugural.fileids()])
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
# NLTK ships corpora in many languages: Spanish (cess_esp), Portuguese
# (floresta), Hindi (indian), and the UDHR translated into 300+ languages.
print(nltk.corpus.cess_esp.words(), "\n")
print(nltk.corpus.floresta.words(), "\n")
print(nltk.corpus.indian.words('hindi.pos'), "\n")
print(nltk.corpus.udhr.fileids()[:10], "\n")
print(nltk.corpus.udhr.words("Korean_Hankuko-UTF8")[:14], "\n")
print(nltk.corpus.udhr.words('Javanese-Latin1')[11:], "\n")
# Cumulative word-length distribution per language over the UDHR corpus.
# Original lines carried pasted ">>>"/"..." REPL prompts (syntax errors in .py).
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)
# Build a corpus from your own local plain-text files (every file under "./").
from nltk.corpus import PlaintextCorpusReader as pcr
corpus_root = "./"
wordlists = pcr(corpus_root, ".*")
print(wordlists.fileids(), "\n")
print(wordlists.words("thesis.txt"))
# The original bare expression discarded its result when run as a script.
print(wordlists.sents())
# Read locally installed (licensed) Penn Treebank .mrg files.
# The original block was a pasted REPL transcript: ">>>" prompts plus the
# interpreter's OUTPUT embedded as bare lines (one of which, the
# "...'Baby', Doc', ..." sentence, was not even valid Python).
from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='20/wsj_2013.mrg')[19])
# Rebuild the genre-conditioned frequency distribution over Brown.
# Original lines carried pasted ">>>"/"..." REPL prompts (syntax errors in .py).
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
# Build (genre, word) pairs for two genres and inspect the resulting
# ConditionalFreqDist. The bare len(genre_word) expression was discarded
# when run as a script; print it instead.
genre_word = [(genre, word)
              for genre in ["news", "romance"]
              for word in brown.words(categories=genre)]
print(len(genre_word))
print(genre_word[:4], "\n")
print(genre_word[-4:])
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd)
print(cfd.conditions())
print(cfd["news"])
print(cfd["romance"])
print(cfd["romance"].most_common(10))
# Same inaugural "america"/"citizen" distribution as above (repeated in the
# original notes). Original lines carried pasted ">>>"/"..." REPL prompts.
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
# Tabulate cumulative word-length counts for English vs. German over the UDHR.
# Original lines carried pasted ">>>"/"..." REPL prompts (syntax errors in .py).
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)
# Bigram extraction on a small sentence. Original lines carried pasted
# ">>>"/"..." REPL prompts, and the resulting list was discarded.
from nltk.util import bigrams
sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
        'and', 'the', 'earth', '.']
print(list(nltk.bigrams(sent)))
def generate_model(cfdist, word, num=15):
    """Greedily generate `num` words from a conditional frequency distribution.

    Starting from `word`, repeatedly print the current word and step to the
    most frequent successor (`cfdist[word].max()`).

    Args:
        cfdist: mapping from a word to a distribution exposing .max()
            (e.g. an nltk.ConditionalFreqDist built from bigrams).
        word: the seed word.
        num: how many words to emit (default 15).

    Returns:
        The list of generated words, in emission order. (The original
        printed the sequence but discarded it; returning it is backward
        compatible and makes the function usable programmatically.)
    """
    generated = []
    for _ in range(num):
        print(word, end=' ')
        generated.append(word)
        word = cfdist[word].max()
    return generated
# Train a bigram model on Genesis (KJV) and generate from the seed "living".
text = nltk.corpus.genesis.words('english-kjv.txt')
# Renamed from `bigrams`: the original shadowed nltk.util.bigrams, which was
# imported a few lines above.
text_bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(text_bigrams)
# The original bare expression discarded this distribution; print it instead.
print(cfd["living"])
generate_model(cfd, "living")
def lexical_diversity(my_text_data):
    """Return the ratio of distinct words to total words in `my_text_data`.

    Args:
        my_text_data: a sequence of (hashable) tokens.

    Returns:
        vocab_size / word_count as a float; 0.0 for an empty sequence
        (the original raised ZeroDivisionError on empty input).
    """
    # Original lines carried pasted ">>>"/"..." REPL prompts (syntax errors).
    word_count = len(my_text_data)
    if word_count == 0:
        return 0.0
    vocab_size = len(set(my_text_data))
    return vocab_size / word_count
# Lexical diversity of Genesis (KJV). Original lines carried pasted ">>>"
# prompts, and the bare call discarded its result.
from nltk.corpus import genesis
kjv = genesis.words('english-kjv.txt')
print(lexical_diversity(kjv))
def unusual_words(text, english_vocab=None):
    """Return sorted lowercase alphabetic tokens of `text` not in a known vocabulary.

    Args:
        text: a sequence of tokens; non-alphabetic tokens are ignored.
        english_vocab: optional pre-built set of lowercase known words.
            Defaults to NLTK's English word list (the original rebuilt that
            set on every call; passing it in also makes the function
            testable without the corpus).

    Returns:
        Sorted list of "unusual" lowercase words (set difference).
    """
    text_vocab = set(w.lower() for w in text if w.isalpha())
    if english_vocab is None:
        english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)
# First six "unusual" tokens in Sense and Sensibility, then NLTK's English
# stopword list.
from nltk.corpus import stopwords
print(unusual_words(nltk.corpus.gutenberg.words("austen-sense.txt"))[:6])
print(stopwords.words("english"))
def content_fraction(text, stopword_list=None):
    """Return the fraction of tokens in `text` that are NOT stopwords.

    Args:
        text: a sequence of word tokens.
        stopword_list: optional iterable of lowercase stopwords. Defaults to
            NLTK's English stopwords (the original hard-coded this and also
            shadowed the module-level `stopwords` import with its local name).

    Returns:
        len(content) / len(text) as a float; 0.0 for empty `text`
        (the original raised ZeroDivisionError).
    """
    # Original lines carried pasted ">>>"/"..." REPL prompts (syntax errors).
    if stopword_list is None:
        stopword_list = nltk.corpus.stopwords.words('english')
    # Set membership is O(1); the original scanned a list for every token.
    stop_set = set(stopword_list)
    if not text:
        return 0.0
    content = [w for w in text if w.lower() not in stop_set]
    return len(content) / len(text)
# Content-word fraction of the Reuters corpus. The original line carried a
# pasted ">>>" prompt and discarded the result.
print(content_fraction(nltk.corpus.reuters.words()))
# CMU pronouncing dictionary: (word, phoneme-list) entries.
# Original lines carried pasted ">>>"/"..." REPL prompts, and len(entries)
# was discarded.
entries = nltk.corpus.cmudict.entries()
print(len(entries))
for entry in entries[42371:42379]:
    print(entry)
# Swadesh comparative wordlists: aligned basic vocabulary across languages;
# paired entries give a simple French->English translation dict.
# Several original lines carried pasted ">>>" REPL prompts (syntax errors).
from nltk.corpus import swadesh
print(swadesh.fileids(), "\n")
print(swadesh.words("en"))
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en[:6], "\n")
translate = dict(fr2en)
print(translate['chien'], "\n")
print(translate['jeter'])
# Toolbox lexicon entries (Rotokas dictionary). Original lines carried pasted
# ">>>" prompts, and the bare expression discarded its result.
from nltk.corpus import toolbox
print(toolbox.entries('rotokas.dic')[:2])
# WordNet basics: synsets, lemma names, definitions, examples, and lemmas.
# Several original lines carried pasted ">>>"/"..." REPL prompts, and some
# bare expressions discarded their results; print them instead.
from nltk.corpus import wordnet as wn
print(wn.synsets('motorcar'))
print(wn.synset('car.n.01').lemma_names())
print(wn.synset('car.n.01').definition(), "\n")
print(wn.synset('car.n.01').examples())
print(wn.synset('car.n.01').lemmas(), "\n")
print(wn.lemma('car.n.01.automobile'), "\n")
print(wn.lemma('car.n.01.automobile').synset(), "\n")
print(wn.lemma('car.n.01.automobile').name())
print(wn.synsets("car"), "\n")
for synset in wn.synsets('car'):
    print(synset.lemma_names())
# Navigate the WordNet hypernym/hyponym hierarchy around car.n.01.
# The bare sorted(...) and root_hypernyms() expressions were discarded in
# script mode (and the latter carried a ">>>" prompt); print them instead.
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print(sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()))
print(motorcar.hypernyms(), "\n")
paths = motorcar.hypernym_paths()
print(len(paths), "\n")
print([synset.name() for synset in paths[0]], "\n")
print([synset.name() for synset in paths[1]], "\n")
print(motorcar.root_hypernyms())
# More WordNet lexical relations: meronyms/holonyms (part-whole), verb
# entailments, and antonyms. Several original lines carried pasted
# ">>>"/"..." REPL prompts (syntax errors in a .py file).
print(wn.synset('tree.n.01').part_meronyms(), "\n")
print(wn.synset('tree.n.01').substance_meronyms(), "\n")
print(wn.synset('tree.n.01').member_holonyms())
for synset in wn.synsets('mint', wn.NOUN):
    print(synset.name() + ':', synset.definition())
print(wn.synset('mint.n.04').part_holonyms(), "\n")
print(wn.synset('mint.n.04').substance_holonyms())
print(wn.synset('walk.v.01').entailments(), "\n")
print(wn.synset('eat.v.01').entailments(), "\n")
print(wn.synset('tease.v.03').entailments(), "\n")
print(wn.lemma('supply.n.02.supply').antonyms(), "\n")
print(wn.lemma('rush.v.01.rush').antonyms(), "\n")
print(wn.lemma('horizontal.a.01.horizontal').antonyms(), "\n")
print(wn.lemma('staccato.r.01.staccato').antonyms(), "\n")
# Semantic similarity in WordNet: shared hypernyms, taxonomy depth, and
# path similarity between whale/tortoise/novel synsets. Many original lines
# carried pasted ">>>" REPL prompts, and the min_depth()/path_similarity()
# expressions discarded their results; print them instead.
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')
print(right.lowest_common_hypernyms(minke), "\n")
print(right.lowest_common_hypernyms(orca), "\n")
print(right.lowest_common_hypernyms(tortoise), "\n")
print(right.lowest_common_hypernyms(novel))
print(wn.synset('baleen_whale.n.01').min_depth(), "\n")
print(wn.synset('whale.n.02').min_depth(), "\n")
print(wn.synset('vertebrate.n.01').min_depth(), "\n")
print(wn.synset('entity.n.01').min_depth())
print(right.path_similarity(minke))
print(right.path_similarity(orca))
print(right.path_similarity(tortoise))
print(right.path_similarity(novel))