"""
A simple implementation of two-class Naive Bayes classification.
Each NaiveBayesClassifier distinguishes between two classes, called 1 and -1,
based on lists of terms (such as words in a document) that appear in each
class.
It returns its results as a log-likelihood ratio: more positive ratios are more
likely to be in class 1, and more negative ratios are more likely to be in
class -1.
Rob wrote the constructor and the helper functions for you. You write the
training and classification functions.
"""
from collections import defaultdict
import math
def lg(x):
    """Return the base-2 logarithm of x.

    Uses math.log2 rather than the hand-rolled change-of-base
    math.log(x) / math.log(2): same result, but computed directly in C
    and slightly more accurate in the last bits.
    """
    return math.log2(x)
def laplace_log_likelihood(obs, total):
    """Log2 probability of *obs* occurrences out of *total*, using
    add-one (Laplace) smoothing so unseen events never get -infinity."""
    smoothed_obs = obs + 1
    smoothed_total = total + 1
    return lg(smoothed_obs) - lg(smoothed_total)
class NaiveBayesClassifier(object):
    """Two-class (+1 / -1) Naive Bayes classifier over bags of terms.

    Scores are log-likelihood ratios: positive values mean class 1 is
    more likely, negative values mean class -1 is more likely.
    """

    def __init__(self):
        # The total number of features (not documents) we've seen in each
        # category.
        self.totals = {-1: 0, 1: 0}
        # The number of times we've seen each feature in each category.
        # defaultdict(int) means missing values are zero.
        self.counts = {-1: defaultdict(int), 1: defaultdict(int)}
        # Every distinct term observed during training.
        self.terms = set()

    def term_ll(self, term, category):
        "Get the log likelihood of a term given a category."
        return laplace_log_likelihood(self.counts[category][term],
                                      self.totals[category])

    def term_ll_ratio(self, term):
        """
        Find the relative log likelihood that a term is in the +1 class,
        as compared to the -1 class. (Think about why this works.)
        """
        return self.term_ll(term, 1) - self.term_ll(term, -1)

    def sorted_terms(self):
        """
        List (term, ratio) pairs, from most negative to most positive.
        """
        # Use the union of both categories' counts (the original built
        # this set but then iterated self.terms, leaving it dead) so the
        # result is complete even if self.terms was not maintained.
        terms = set(self.counts[-1]) | set(self.counts[1])
        return sorted(((term, self.term_ll_ratio(term)) for term in terms),
                      key=lambda pair: pair[1])

    def learn(self, terms, category):
        """Train on one document: a list of terms with a known category
        (1 or -1). Updates the per-category counts, the category total,
        and the overall vocabulary."""
        assert category in (-1, 1)
        for term in terms:
            self.counts[category][term] += 1
            self.totals[category] += 1
            self.terms.add(term)

    def classify(self, terms):
        """
        Return the summed log-likelihood ratio of the given terms:
        positive favors class 1, negative favors class -1.
        (Naive Bayes: independence lets us add the per-term ratios.)
        """
        return sum(self.term_ll_ratio(term) for term in terms)

    def test(self, testdata):
        """
        Evaluate on an iterable of (terms, category) pairs and return
        the fraction classified correctly. A ratio of exactly 0 is
        treated as a prediction of class 1; empty test data scores 0.0.
        """
        correct = 0
        total = 0
        for terms, category in testdata:
            total += 1
            predicted = 1 if self.classify(terms) >= 0 else -1
            if predicted == category:
                correct += 1
        return correct / total if total else 0.0