forked from sd17spring/InteractiveProgramming
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinterpret.py
More file actions
263 lines (234 loc) · 7.66 KB
/
interpret.py
File metadata and controls
263 lines (234 loc) · 7.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
'''
Authors: Emily Lepert and Vicky McDermott
The Interpret class takes the answer to a question and updates the
probability that someone is from a given region and age group based on that response
'''
from data import Data
from os.path import exists
from pickle import dump, load
import sys
class Interpret:
    '''
    Bayesian update of the probability that a respondent belongs to each
    location-age group, given their answer to one survey question.

    A hypothesis is a string of the form 'location; age'
    (ie: 'East North Central; 18 - 29').  Priors and posteriors are flat lists
    indexed in the order produced by key_creator().

    Attributes: data_type, data, prior, question_index, answer,
    question_list, age_list, location_list
    '''

    # Question indices available in each supported data set.
    _QUESTION_LISTS = {
        'comma': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
        'earthquake': (0, 1, 2, 3, 4, 5, 6),
    }

    # Pickled factor file that bayesian_update reads for each data set.
    _FACTOR_FILES = {
        'comma': 'comma_factors.txt',
        'earthquake': 'earthquake_factors.txt',
    }

    # Answers that show up wrapped in literal double quotes, mapped to their
    # clean form.  Only these exact strings are rewritten, per data set.
    _QUOTED_ANSWERS = {
        'earthquake': {
            '"Yes, one or more minor ones"': 'Yes, one or more minor ones',
            '"Yes, one or more major ones"': 'Yes, one or more major ones',
        },
        'comma': {
            '"It\'s important for a person to be honest, kind and loyal."':
                "It's important for a person to be honest, kind and loyal.",
            '"It\'s important for a person to be honest, kind, and loyal."':
                "It's important for a person to be honest, kind, and loyal.",
            '"Some experts say it\'s important to drink milk, but the data are inconclusive."':
                "Some experts say it's important to drink milk, but the data are inconclusive.",
            '"Some experts say it\'s important to drink milk, but the data is inconclusive."':
                "Some experts say it's important to drink milk, but the data is inconclusive.",
            '"$50,000 - $99,999"': '$50,000 - $99,999',
            '"$25,000 - $49,999"': '$25,000 - $49,999',
            '"$0 - $24,999"': '$0 - $24,999',
            '"$150,000+"': '$150,000+',
            '"$100,000 - $149,999"': '$100,000 - $149,999',
        },
    }

    def __init__(self, prior, question, answer, data_type):
        '''
        Stores the prior, the question/answer pair to update on, and loads the
        matching data set.

        prior: list of prior probabilities, one per key of key_creator()
        question: index of the question that was answered
        answer: the answer that was given
        data_type: 'comma' or 'earthquake'
        '''
        # differentiates between a comma or earthquake data set
        self.data_type = data_type
        # An unrecognised data_type leaves `data` and `question_list` unset,
        # exactly as before; methods that need them will then fail.
        if data_type in self._QUESTION_LISTS:
            self.data = Data(data_type)
            # list of question indices
            self.question_list = list(self._QUESTION_LISTS[data_type])
        # prior probabilities
        self.prior = prior
        # question index
        self.question_index = question
        # answer to the question
        self.answer = answer
        # age list
        self.age_list = ['18 - 29', '30 - 44', '45 - 59', '60']
        # location list
        self.location_list = ['East North Central', 'East South Central', 'Middle Atlantic',
                              'Mountain', 'New England', 'Pacific', 'South Atlantic',
                              'West North Central', 'West South Central']

    def key_creator(self):
        '''
        Creates the location-age keys for the final output of new posterior
        probabilities, locations outermost and ages innermost.
        '''
        return [location + '; ' + age
                for location in self.location_list
                for age in self.age_list]

    def get_location(self, key):
        '''
        Gets the location associated with a given posterior output
        (key is an index into key_creator()).
        '''
        return self.key_creator()[key].split(';')[0]

    def get_age(self, key):
        '''
        Gets the age group associated with a given posterior output
        (key is an index into key_creator()).

        NOTE(review): keys are joined with '; ' but split on ';', so the value
        returned keeps a leading space (' 18 - 29').  Left as is because
        callers outside this class may rely on it.
        '''
        return self.key_creator()[key].split(';')[1]

    def key_formatting(self, a):
        '''
        Gets rid of the odd formatting (surrounding double quotes) on the
        answers known to carry it; any other answer is returned unchanged.
        An unrecognised data_type also returns the answer unchanged.
        '''
        return self._QUOTED_ANSWERS.get(self.data_type, {}).get(a, a)

    def denominator_factor(self, question, answer):
        '''
        Finds the total number of people for each hypothesis (location - age combo)
        who answered a question.

        answer is accepted for symmetry with numerator_factor but is not used.
        Returns a dictionary: key is location-age, value is the total # of answers.
        '''
        totals = {}
        for location in self.location_list:
            for age in self.age_list:
                # get_data is expected to return a dictionary mapping each
                # answer to the number of people who gave it
                counts = self.data.get_data(location, age, question)
                totals[location + '; ' + age] = sum(counts[ans] for ans in counts)
        return totals

    def numerator_factor(self, question, a):
        '''
        Finds the number of people for each hypothesis (location - age combo)
        who gave answer a to a question.

        Returns a dictionary: key is location-age, value is that count.
        '''
        factor = {}
        for location in self.location_list:
            for age in self.age_list:
                counts = self.data.get_data(location, age, question)
                # An answer nobody in the group gave is counted as 1, not 0.
                # NOTE(review): presumably so that a single unseen answer does
                # not eliminate a hypothesis outright - confirm with authors.
                factor[location + '; ' + age] = counts[a] if a in counts else 1
        return factor

    def bayesian_single_factor(self, question, answer):
        '''
        Given a piece of data (question and answer), create dictionary with corresponding
        P(D|H)
        H: string of location + age
        ie: 'East North Central; 18 - 29'
        Creates a dictionary for each specific question answer combo.

        Hypotheses for which nobody answered the question (denominator 0) are
        left out of the result.
        '''
        denominator = self.denominator_factor(question, answer)
        numerator = self.numerator_factor(question, answer)
        # Convert before dividing so the ratio is not truncated by integer
        # division on interpreters where int / int yields an int.
        return {key: float(numerator[key]) / total
                for key, total in denominator.items()
                if total != 0}

    def dictionary_of_qa(self):
        '''
        Creates a dictionary of questions with all the (non-empty) responses
        seen for them across every location-age group.
        '''
        dic_of_qa = {}
        for location in self.location_list:
            for age in self.age_list:
                for q in self.question_list:
                    # every question gets an entry, even with no answers
                    seen = dic_of_qa.setdefault(q, [])
                    for response in self.data.get_data(location, age, q):
                        if response != '' and response not in seen:
                            seen.append(response)
        return dic_of_qa

    def bayesian_factors(self, file_name, reset=False):
        '''
        Returns the factors for all combos of question answers, as
        {question: {cleaned answer: {hypothesis: P(D|H)}}}.

        If file_name already exists and reset is false, the pickled factors are
        loaded from it without touching the data set.  Otherwise the factors
        are computed, pickled to file_name, and returned.
        '''
        if exists(file_name) and not reset:
            # NOTE: pickle can execute arbitrary code; only load factor files
            # that this program wrote itself.
            with open(file_name, 'rb') as cached:
                return load(cached)

        new_factors = {}
        for q, answers in self.dictionary_of_qa().items():
            for a in answers:
                # clean the keys up
                new_factors.setdefault(q, {})[self.key_formatting(a)] = \
                    self.bayesian_single_factor(q, a)

        with open(file_name, 'wb') as out:
            dump(new_factors, out)
        return new_factors

    def bayesian_update(self):
        '''
        Updates the prior probabilities with posterior probabilities using Bayes.

        Reads the pickled factors for this data set from the working directory
        and returns the normalised posterior as a list in key_creator() order
        (all zeros if every product is zero).

        Raises ValueError if data_type is not a known data set.
        '''
        if self.data_type not in self._FACTOR_FILES:
            raise ValueError('unknown data_type: %r' % (self.data_type,))

        # NOTE: pickle can execute arbitrary code; only load factor files
        # that this program wrote itself.
        with open(self._FACTOR_FILES[self.data_type], 'rb') as cached:
            factors = load(cached)

        # index into the right question answer combo
        likelihood = factors[self.question_index][self.answer]

        # Look each hypothesis up by key so the result stays aligned with
        # key_creator() regardless of dict ordering.
        # NOTE(review): a hypothesis missing from the factors (nobody in that
        # group answered the question) is given weight 0 - confirm that
        # eliminating it is the intended treatment.
        product = [likelihood.get(key, 0.0) * self.prior[index]
                   for index, key in enumerate(self.key_creator())]

        # find normalizing factor
        total = sum(product)
        if total == 0:
            return [0 for _ in product]
        # normalize
        return [value / total for value in product]

    def biggest_probability(self):
        '''
        Finds the biggest probability someone has of being from somewhere and
        returns the matching location-age key.
        '''
        posterior = self.bayesian_update()
        return self.key_creator()[posterior.index(max(posterior))]