"""Walk-through of the nltk_lite taggers on the Brown corpus.

The script builds a default tagger, a regular-expression tagger, a unigram
tagger (with and without backoff) and an affix tagger, printing the tagged
output and the accuracy figure returned by ``tag.accuracy`` for each.

All ``print`` calls take a single parenthesised argument, which behaves the
same as the Python 2 ``print`` statement this script was written with.
"""

from itertools import islice

from nltk_lite import tag
from nltk_lite import tokenize
from nltk_lite.corpora import brown

# --- Tokenize a sample sentence --------------------------------------------
text = "John saw 3 polar bears ."
tokens = list(tokenize.whitespace(text))
print(tokens)

# --- Default tagger, constructed with the single tag 'nn' -------------------
my_tagger = tag.Default('nn')
print(list(my_tagger.tag(tokens)))

# --- Regexp tagger: numbers get 'cd', everything else falls through to 'nn' -
# The decimal point is escaped: an unescaped '.' matches ANY character, which
# would let tokens such as '3x5' be tagged as cardinal numbers.
patterns = [
    (r'^-?[0-9]+(\.[0-9]+)?$', 'cd'),
    (r'.*', 'nn'),
]
nn_cd_tagger = tag.Regexp(patterns)
print(list(nn_cd_tagger.tag(tokens)))

# --- Unigram tagger trained on the first 500 tagged sentences ---------------
train_sents = list(islice(brown.tagged(), 500))  # sents 0..499
unigram_tagger = tag.Unigram()
unigram_tagger.train(train_sents)

text = "John saw the book on the table"
tokens = list(tokenize.whitespace(text))
print(list(unigram_tagger.tag(tokens)))

# Same tagger again, this time with the regexp tagger as backoff.
unigram_tagger = tag.Unigram(backoff=nn_cd_tagger)
unigram_tagger.train(train_sents)
print(list(unigram_tagger.tag(tokens)))

# NOTE: this accuracy is measured on the sentences the tagger was trained on.
acc = tag.accuracy(unigram_tagger, train_sents)
print('Unigram tagger accuracy = %4.1f%%' % (100 * acc))

# --- Affix tagger backing off to the unigram tagger -------------------------
# NOTE(review): the (-3, 5) arguments are presumably the affix length and a
# minimum word length -- confirm against the nltk_lite tag.Affix docs.
affix_tagger = tag.Affix(-3, 5, backoff=unigram_tagger)
affix_tagger.train(train_sents)
print(list(affix_tagger.tag(tokens)))

# Also measured on the training sentences.
acc = tag.accuracy(affix_tagger, train_sents)
print('Affix tagger accuracy = %4.1f%%' % (100 * acc))

# --- Evaluate on sentences that were not used for training ------------------
# Read section 'a' once and only as far as needed (600 sentences), then split
# it into the training and held-out portions.
section_a = list(islice(brown.tagged('a'), 600))
train_sents = section_a[:500]
unseen_sents = section_a[500:600]  # sents 500-599

unigram_tagger = tag.Unigram(backoff=nn_cd_tagger)
unigram_tagger.train(train_sents)
acc = tag.accuracy(unigram_tagger, unseen_sents)
print('Unigram tagger accuracy (on unseen) = %4.1f%%' % (100 * acc))

# --- Disabled error analysis ------------------------------------------------
# Kept for reference: counts the (tagged context, gold context) pairs at the
# positions where the unigram tagger's tag differs from the gold tag, then
# prints the five most frequent ones.
#
#errors = {}
#for i in range(len(unseen_sents)):
#    raw_sent = tag.untag(unseen_sents[i])
#    test_sent = list(unigram_tagger.tag(raw_sent))
#    unseen_sent = unseen_sents[i]
#    for j in range(len(test_sent)):
#        if test_sent[j][1] != unseen_sent[j][1]:
#            test_context = test_sent[j-1:j+1]
#            gold_context = unseen_sent[j-1:j+1]
#            if None not in test_context:
#                pair = (tuple(test_context), tuple(gold_context))
#                errors[pair] = errors.get(pair, 0) + 1
#
#counted_errors = [(errors[k], k) for k in errors.keys()]
#counted_errors.sort()
#counted_errors.reverse()
#for err in counted_errors[:5]:
#    print err

# --- Default vs. unigram tagger, trained on section 'b' ---------------------
# Both taggers are scored on the held-out sentences from section 'a'.
t0 = tag.Default('nn')
t1 = tag.Unigram(backoff=t0)
t1.train(brown.tagged('b'))

accuracy0 = tag.accuracy(t0, unseen_sents)
accuracy1 = tag.accuracy(t1, unseen_sents)

print('Default Accuracy = %4.1f%%' % (100 * accuracy0))
print('Unigram Accuracy = %4.1f%%' % (100 * accuracy1))