In [31]:
def read_pp_examples(filename):
    """
    Load PP examples from a whitespace-separated text file.

    Every line must hold exactly five fields, in this order:
    v, n1, p, n2, answer.

    filename: path of the file to read.
    returns: a list of {'answer': answer, 'pp': (v, n1, p, n2)} dicts,
             in the order the lines appear in the file.
    raises: AssertionError if a line does not split into exactly five fields.
    """
    pp_examples = []
    # `with open(...)` closes the handle even when a malformed line aborts
    # the loop; it also works on Python 3, where the file() builtin is gone.
    with open(filename) as examples_file:
        for line in examples_file:
            fields = line.strip().split()
            assert len(fields) == 5, \
                "expected 5 fields, got %d: %r" % (len(fields), line)
            v, n1, p, n2, answer = fields
            pp_examples.append({'answer': answer, 'pp': (v, n1, p, n2)})
    return pp_examples
In [32]:
# Load every example from the data file as a list of {'answer', 'pp'} dicts.
pp_examples = read_pp_examples("pp_examples.txt")
In [33]:
# Number of examples that were loaded.
len(pp_examples)
Out[33]:
25858
In [34]:
from random import choice
In [36]:
# Show one randomly drawn example. The parenthesised single-argument form
# prints the same text on Python 2 and is valid syntax on Python 3.
print(choice(pp_examples))
{'answer': 'N', 'pp': ('signed', 'contract', 'with', 'Bofors')}

In [37]:
# Keep one randomly drawn example for inspection in the next cells.
# NOTE(review): no random seed is set in the visible cells, so the pick
# differs from run to run.
example = choice(pp_examples)
In [38]:
# The (v, n1, p, n2) tuple of the sampled example.
example['pp']
Out[38]:
('conduct', 'electricity', 'without', 'resistance')
In [39]:
# The label stored with the sampled example.
example['answer']
Out[39]:
'V'
In [40]:
# Split point between training and held-out data, named once so the two
# slices below cannot drift apart.
NUM_TRAIN_EXAMPLES = 20000
train_examples = pp_examples[:NUM_TRAIN_EXAMPLES]
test_examples = pp_examples[NUM_TRAIN_EXAMPLES:]
In [41]:
print len(train_examples), len(test_examples)
20000 5858

In [42]:
def evaluate_classifier(examples, pp_resolver):
    """
    Compute the accuracy of a resolver on labelled examples.

    examples: a list of {'pp':(v,n1,p,n2), 'answer':answer }
    pp_resolver has a classify() function: from (v,n1,p,n2) to 'N' / 'V'

    returns: the fraction of examples for which classify() equals the stored
             'answer', as a float; 0.0 when `examples` is empty.
    """
    correct = 0
    total = 0
    for example in examples:
        total += 1
        if pp_resolver.classify(example['pp']) == example['answer']:
            correct += 1
    if total == 0:
        # Nothing to score: return 0.0 instead of dividing by zero.
        return 0.0
    # float() forces true division on Python 2 as well as Python 3.
    return float(correct) / total
    
In [43]:
class AlwaysSayN:
    """Baseline resolver that answers 'N' for every input."""

    # The single label this resolver ever produces.
    _LABEL = 'N'

    def __init__(self):
        """Nothing to configure."""

    def classify(self, pp):
        """Return 'N' whatever `pp` is."""
        return self._LABEL
In [44]:
class AlwaysSayV:
    """Baseline resolver that answers 'V' for every input."""

    # The single label this resolver ever produces.
    _LABEL = 'V'

    def __init__(self):
        """Nothing to configure."""

    def classify(self, pp):
        """Return 'V' whatever `pp` is."""
        return self._LABEL
In [46]:
# Baseline: accuracy on the held-out examples when always answering 'N'.
evaluate_classifier(test_examples, AlwaysSayN())
Out[46]:
0.5402867872994196
In [27]:
class MajorityClassResolver:
    """
    Resolver that always answers with the more frequent training label.

    'V' is chosen only when it is strictly more frequent than 'N'; ties and
    empty training data therefore give 'N'.
    """

    def __init__(self, training_examples):
        """training_examples: iterable of dicts that carry an 'answer' key."""
        labels = [example['answer'] for example in training_examples]
        n_count = labels.count('N')
        v_count = labels.count('V')
        self.answer = 'V' if v_count > n_count else 'N'

    def classify(self, pp):
        """Return the label chosen at construction time; `pp` is ignored."""
        return self.answer
In [47]:
# Accuracy on the held-out examples of the majority-class baseline fitted on
# the training split.
evaluate_classifier(test_examples, MajorityClassResolver(train_examples))
Out[47]:
0.5402867872994196
In [48]:
class LookupResolver:
    """
    Resolver that memorises the answer for each exact pp tuple seen in
    training and defers to MajorityClassResolver for tuples it has not seen.

    When the same tuple occurs more than once in training, the answer of its
    last occurrence is the one kept.
    """

    def __init__(self, training_examples):
        """training_examples: dicts carrying 'pp' and 'answer' keys."""
        # Later duplicates overwrite earlier ones, as in a plain assignment loop.
        self.answers = {
            example['pp']: example['answer'] for example in training_examples
        }
        self.backoff = MajorityClassResolver(training_examples)

    def classify(self, pp):
        """Return the memorised answer for `pp`, or the backoff's answer."""
        try:
            return self.answers[pp]
        except KeyError:
            return self.backoff.classify(pp)
In [49]:
# Accuracy on the held-out examples of the exact-match lookup resolver
# (with majority-class backoff) fitted on the training split.
evaluate_classifier(test_examples, LookupResolver(train_examples))
Out[49]:
0.599692727893479
In []: