forked from sd17spring/InteractiveProgramming
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
195 lines (179 loc) · 7.27 KB
/
data.py
File metadata and controls
195 lines (179 loc) · 7.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
'''
Authors: Vicky McDermott and Emily Lepert
This function pulls in and sorts data by location and age so
that it can be interpreted.
'''
from pathlib import Path
import pickle
class Data:
def __init__(self, data_type):
'''
This initializes the Data class and assigns it to a type.
Type will either be earthquake or comma aligning with one
of our two data sets. It will then go on to read in the
data from the appropriate csv file.
type: The type of data associated with the data object
(earthquake or comma)
'''
self.type = data_type
if self.type == 'earthquake':
f = open('earthquake_data.csv')
self.lines = f.readlines()
self.num_of_ques = 9
self.age_spot = 7
elif self.type == 'comma':
f = open('comma_data.csv')
self.lines = f.readlines()
self.num_of_ques = 11
self.age_spot = 9
self.fix_ages()
def get_question(self, number):
'''
This function allows you to get the question of a given number.
number: Number of the question you want returned.
returns: Text of question of number 'number'
'''
self.lines[0].strip('"')
ques = [line.strip('"') for line in self.lines[0].split('","')]
ques.pop(self.age_spot)
return ques[number]
def sort_by_location(self):
'''
Sorts the data by location from where people come.
returns: Dictionary with locations as keys and lists of
answers given by people from those locations as values.
'''
locations = {'East North Central': [], 'East South Central': [],
'Middle Atlantic': [], 'Mountain': [], 'New England': [],
'Pacific': [], 'South Atlantic': [],
'West North Central': [], 'West South Central': []}
for line in self.lines:
key = line[len(line)-1].strip()
if key in locations:
locations[str(key)].append(line[:len(line)-1])
return locations
def fix_ages(self):
'''
This function fixes the ages so they are all in the same format.
'''
for i, line in enumerate(self.lines):
line = line.split(';')
if i > 0:
if line[self.age_spot] == '18-29':
line[self.age_spot] = '18 - 29'
elif line[self.age_spot] == '30-44':
line[self.age_spot] = '30 - 44'
elif line[self.age_spot] == '45-60':
line[self.age_spot] = '45 - 59'
elif line[self.age_spot] == '> 60':
line[self.age_spot] = '60'
self.lines[i] = line
def sort_by_age(self):
'''
Sorts the data by age and location.
returns: Nested dictionary with locations as keys to
first dictionary and ages as keys to the nested dictionaries.
Values are lists of all answers from all people from that
age and location.
'''
ultimate_dict = {}
locations = self.sort_by_location()
for key in locations:
ages = {}
for line in locations[key]:
if line[self.age_spot] in ages:
ages[line[self.age_spot]].append(line[:self.age_spot] +
line[self.age_spot+1:])
else:
ages[line[self.age_spot]] = [line[:self.age_spot] +
line[self.age_spot+1:]]
ultimate_dict[key] = ages
return ultimate_dict
def sort_answers(self):
"""
Sorts the data by questions and numbers of answers for
each age/location
"""
if self.type == 'earthquake':
file_name = 'earthquake_dict.pickle'
elif self.type == 'comma':
file_name = 'comma_dict.pickle'
my_file = Path(file_name)
if my_file.is_file():
# Load data from a file
input_file = open(file_name, 'rb')
best_dict = pickle.loads(input_file.read())
return best_dict
best_dict = {}
# Get dictionary which is sorted by location and age
ultimate_dict = self.sort_by_age()
# Go through each location
for location in ultimate_dict:
# Go through each age for a specific location
age_dict = ultimate_dict[location]
nested_dict = {}
for age in age_dict:
# Access answers for specific location and age
agedppl = age_dict[age]
# Initialize list of answers for each question
answers = []
for i in range(self.num_of_ques):
answers.append([])
# Go through each person from that location and age group
for person in agedppl:
# Go through that person's answers and add them to list
for i, answer in enumerate(person):
answers[i].append(answer)
# After going through each person convert list to freqdict
# Be sure to convert list for each question to freqdict
new_ans_list = []
for i in range(self.num_of_ques):
new_ans = self.word_List_To_Freq_Dict(answers[i])
# Recreate your dictionary
new_ans_list.append(new_ans)
nested_dict[age] = new_ans_list
best_dict[location] = nested_dict
f = open(file_name, 'wb')
pickle.dump(best_dict, f)
return best_dict
def word_List_To_Freq_Dict(self, wordlist):
'''Given a list of words, return a dictionary of
word-frequency pairs.
>>> word_List_To_Freq_Dict(['hi', 'hi', 'hi'])
{'hi': 3}
'''
wordfreq = [wordlist.count(p) for p in wordlist]
return dict(zip(wordlist, wordfreq))
def fix_comma_issue(self):
"""
Separates entries with semicolons so that regular commas are
differentiated.
"""
if self.type == 'earthquake':
f = open('earthquake_data.csv', 'wt')
elif self.type == 'comma':
f = open('comma_data.csv', 'wt')
f.write(self.lines[0])
for i in range(len(self.lines)):
if i > 0:
my_line = list(self.lines[i])
for i, letter in enumerate(my_line):
if letter == ',':
if my_line[i+1] == ' ':
pass
elif my_line[i+1] == '0' or my_line[i+1] == '9':
pass
else:
my_line[i] = ';'
print('fixing a comma')
self.lines[i] = ''.join(my_line)
f.write(self.lines[i])
def get_data(self, location, age, question):
"""
Gets number of times each answer showed up for the given
question among people from the given location and age.
"""
best_dict = self.sort_answers()
age_dict = best_dict[location]
answers = age_dict[age]
return answers[question]