-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAnalytics.py
More file actions
276 lines (229 loc) · 10.8 KB
/
Analytics.py
File metadata and controls
276 lines (229 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
from Utils import extract_parameter_value_as_int
## agg backend is used to create plot as a .png file
mpl.use('agg')
def print_nice_scatterplot(data: pd.DataFrame,
                           graph_filename: str,
                           col_examined: str,
                           col_grouped_by: str,
                           col_related: str,
                           title: str,
                           legend_title: str,
                           x_title: str,
                           y_title: str,
                           max_val=None,
                           min_val=None):
    """Save a scatterplot of ``col_examined`` against ``col_related``.

    One series is drawn per distinct value of ``col_grouped_by``. The groups
    are drawn in ascending key order; the marker size shrinks by a constant
    step from one group to the next, and each group takes its colour from a
    graded seaborn 'cyan' dark palette.

    Args:
        data: table containing the three columns named below.
        graph_filename: path passed unchanged to ``Figure.savefig``.
        col_examined: column plotted on the y axis.
        col_grouped_by: column whose distinct values define the series.
        col_related: column plotted on the x axis.
        title: title of the plot.
        legend_title: title of the legend.
        x_title: label of the x axis.
        y_title: label of the y axis.
        max_val: upper bound used to lay out the y ticks; defaults to the
            maximum of ``col_examined``.
        min_val: lower bound used to lay out the y ticks; defaults to the
            minimum of ``col_examined``.
    """
    # preset configuration
    scale = 2
    max_marker_size = 1000 * scale
    min_marker_size = 1 * scale

    if max_val is None:
        max_val = data[col_examined].max()
    if min_val is None:
        min_val = data[col_examined].min()

    # Ticks are anchored at 0 and spaced 1/40 of the value range apart.
    tick = (max_val - min_val) / 40
    # False for 0 (all values equal) and for NaN (no values); np.arange cannot
    # work with such a step, so matplotlib's default ticks are kept instead.
    has_usable_tick = tick > 0 or tick < 0
    y_ticks = None
    if has_usable_tick:
        # Both halves start at 0; np.unique drops the duplicated 0 tick.
        y_ticks = np.unique(np.concatenate([
            np.arange(0, min_val - tick, -tick)[::-1],
            np.arange(0, max_val, tick),
        ]))

    # sort the groups by their key so we can apply reasonable coloring
    groups = sorted(data.groupby(col_grouped_by), key=lambda item: item[0])
    # use seaborn to generate a graded list with one color per group
    colors = sns.color_palette(sns.dark_palette('cyan', n_colors=len(groups)),
                               n_colors=len(groups))
    size_step = (max_marker_size - min_marker_size) / len(groups) if groups else 0

    _fig = plt.figure(figsize=(8 * scale, 40 * scale))
    try:
        ax1 = _fig.add_subplot(111)
        ax1.set_title(title, fontsize=25 * scale)
        ax1.set_xlabel(x_title, fontsize=25 * scale)
        ax1.set_ylabel(y_title, rotation=90, fontsize=25 * scale)

        current_size = max_marker_size
        # The palette is consumed from its last color towards its first one.
        for (group, group_df), color in zip(groups, reversed(colors)):
            ax1.scatter(x=group_df[col_related],
                        y=group_df[col_examined],
                        label=str(group) + ' % ',
                        color=color,
                        s=current_size)
            current_size -= size_step

        if y_ticks is not None:
            ax1.set_yticks(y_ticks)
        ax1.tick_params(axis='x', labelsize=22 * scale)
        ax1.tick_params(axis='y', labelsize=22 * scale)

        legend = plt.legend(loc="lower center", title=legend_title, ncol=2,
                            prop={'size': 16 * scale})
        legend.get_title().set_fontsize(22 * scale)

        _fig.savefig(graph_filename, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(_fig)
def print_boxplots(data: pd.DataFrame,
                   graph_filename: str,
                   col_examined: str,
                   col_related: str,
                   sort_func,
                   title: str,
                   x_title: str,
                   y_title: str,
                   min_val=None,
                   max_val=None
                   ):
    """Save one boxplot of ``col_examined`` per distinct ``col_related`` value.

    The mean of every group is drawn on top of the boxes as a connected line
    with markers. The figure is written to ``graph_filename + '.png'``.

    Args:
        data: table containing ``col_examined`` and ``col_related``.
        graph_filename: output path without the ``.png`` suffix.
        col_examined: column whose values are summarised by the boxplots.
        col_related: column whose distinct values define the boxes.
        sort_func: sort key (not a comparator) applied to tuples of two
            elements ``(data series, group name)``; it decides the
            left-to-right order of the boxes in the graph.
        title: title of the plot.
        x_title: label of the x axis.
        y_title: label of the y axis.
        min_val: lower bound used to lay out the y ticks; defaults to the
            minimum of ``col_examined``.
        max_val: upper bound used to lay out the y ticks; defaults to the
            maximum of ``col_examined``.

    Raises:
        ValueError: if ``data`` contains no groups to plot.
    """
    # graph parameters
    scale = 1
    show_fliers = True
    mean_color = 'b'
    mean_marker = 'o'

    # Group by the bare column name: grouping by a one-element list makes
    # pandas 2.x yield 1-tuples as keys, which would end up in the tick labels.
    series_with_labels = [(group_df[col_examined], group)
                          for group, group_df in data.groupby(col_related)]
    if not series_with_labels:
        raise ValueError("no groups found in column {!r}".format(col_related))

    # order the boxes by the caller-supplied key
    series_with_labels.sort(key=sort_func)
    data_to_plot_arr, labels = zip(*series_with_labels)
    positions = list(range(len(labels)))

    # the figure gets one unit of width per box
    figsize = (len(labels) * scale, 25 * scale)  # originally (60, 30)

    if max_val is None:
        max_val = data[col_examined].max()
    if min_val is None:
        min_val = data[col_examined].min()

    # Ticks are anchored at 0 and spaced 1/40 of the value range apart; the
    # positive half runs six ticks past max_val.
    tick = (max_val - min_val) / 40
    # False for 0 (all values equal) and for NaN; np.arange cannot work with
    # such a step, so matplotlib's default ticks are kept instead.
    has_usable_tick = tick > 0 or tick < 0
    y_labels = None
    if has_usable_tick:
        # Both halves start at 0; np.unique drops the duplicated 0 tick.
        y_labels = np.unique(np.concatenate([
            np.arange(0, min_val - tick, -tick)[::-1],
            np.arange(0, max_val + 6 * tick, tick),
        ]))

    _fig = plt.figure(figsize=figsize)
    try:
        _ax = _fig.add_subplot(111)

        # Create the boxplot and overlay the per-group means
        _ax.boxplot(data_to_plot_arr, positions=positions, showfliers=show_fliers)
        _ax.plot(positions,
                 [series.mean() for series in data_to_plot_arr],
                 marker=mean_marker,
                 color=mean_color)

        _ax.set_title(title, fontsize=25 * scale)
        _ax.set_xlabel(x_title, fontsize=25 * scale)
        _ax.set_ylabel(y_title, rotation=90, fontsize=25 * scale)
        _ax.set_xticklabels(labels, rotation=90)
        if y_labels is not None:
            _ax.set_yticks(y_labels)
        _ax.tick_params(axis='x', labelsize=22 * scale)
        _ax.tick_params(axis='y', labelsize=22 * scale)

        # An empty line plot serves as the legend handle for the mean marker.
        mean_handle = plt.plot([], [], marker=mean_marker, ms=6 * scale, ls="",
                               mec=None, color=mean_color, label="Mean")[0]
        legend = plt.legend(handles=[mean_handle],
                            bbox_to_anchor=[0.5, -0.12],
                            loc='center',
                            title="Boxplots show first and third quartile,\n with variability represented with whiskers",
                            ncol=2,
                            prop={'size': 16 * scale})
        legend.get_title().set_fontsize(16 * scale)
        _ax.grid(True)

        # Save the figure
        _fig.savefig(graph_filename + '.png', bbox_inches='tight')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(_fig)
# # this sorts times and labels for display in the boxplot by the max of the boxplots
# data_to_plot_arr, labels = zip(*sorted(zip(data_to_plot_arr,labels), key=lambda e: e[0].max()))
#
# # Create a figure instance
# fig2 = plt.figure( figsize=figsize)
# # Create an axes instance
# ax2 = fig2.add_subplot(111)
# ax2.set_xlabel(col_related, fontsize=x_title_font_size)
#
# show_fliers = True
# ax2.boxplot(data_to_plot_arr, showfliers=show_fliers)
# ax2.set_xticklabels(labels, rotation=90)
# ax2.set_yticks(y_labels)
# ax2.tick_params(axis='x', labelsize=22)
# ax2.grid(True)
#
# # Save the figure
# fig2.savefig(graph_filename + 'boxplots-' + col_related + '-maxSorted-' + ('fliers' if show_fliers else'') + '.png', bbox_inches='tight')
#
#
#
# # this sorts times and labels for display in the boxplot by the mean of the boxplots
# data_to_plot_arr, labels = zip(*sorted(zip(data_to_plot_arr,labels), key=lambda e: e[0].mean()))
#
# # Create a figure instance
# fig3 = plt.figure( figsize=figsize)
# # Create an axes instance
# ax3 = fig3.add_subplot(111)
# ax3.set_xlabel(col_related, fontsize=x_title_font_size)
#
# show_fliers = False
# ax3.boxplot(data_to_plot_arr, showfliers=show_fliers)
# ax3.set_xticklabels(labels, rotation=90)
# ax3.set_yticks(np.arange(-0.03, 0.02, 0.001 ))
# ax3.tick_params(axis='x', labelsize=22)
# ax3.grid(True)
#
# # Save the figure
# fig3.savefig(graph_filename + 'boxplots-' + col_related + '-meanSorted-' + ('fliers' if show_fliers else'') + '.png', bbox_inches='tight')
# Script section: read the merged gains table and write four graphs for it
# (boxplots and scatterplots, by od_params and by removed).

# read dataset (sep must be passed by keyword; pandas 2.x rejects it positionally)
df = pd.read_csv("gains-nn-merged.csv", sep=",")
#df = df.sort_values('removed')
df['removed'] = df['removed'].map(str)
# reduce od_params to the value of its n_neighbors parameter, then store it as text
df['od_params'] = df['od_params'].map(lambda x: extract_parameter_value_as_int(x, parameter="n_neighbors"))
#df = df.sort_values('od_params')
df['od_params'] = df['od_params'].map(str)

od_method_name = 'NearestNeighbors'

print_boxplots(data=df,
               graph_filename=od_method_name + '-boxplots-od_params',
               col_examined="gain",
               col_related="od_params",
               sort_func=lambda e: -e[0].mean(),
               title="Accuracy of classifiers\n for different n_neighbors parameter\nof OD method NearestNeighbors\nsorted on the mean values\n",
               x_title="parameter n_neighbors ",
               y_title="Increase in accuracy after applying OD")
#exit()

print_nice_scatterplot(data=df,
                       graph_filename=od_method_name + "-scatterplot-od_params",
                       col_examined="gain",
                       col_grouped_by="removed",
                       col_related="od_params",
                       title="Accuracy of classifiers\n for different n_neighbors parameter\nof OD method NearestNeighbors\nsorted on the mean values\n",
                       x_title="parameter n_neighbors",
                       y_title="change in gain after OD",
                       legend_title="% of removed outliers")

print_boxplots(data=df,
               graph_filename=od_method_name + '-boxplots-removed',
               col_examined="gain",
               col_related="removed",
               sort_func=lambda e: -e[0].mean(),
               title="Accuracy of classifiers\n for different n_neighbors parameter\nof OD method NearestNeighbors\nsorted on the mean values\n",
               x_title="% removed ",
               y_title="Increase in accuracy after applying OD")

df.sort_values(by="removed", inplace=True)
print_nice_scatterplot(data=df,
                       graph_filename=od_method_name + '-scatterplot-removed',
                       col_examined="gain",
                       col_grouped_by="od_params",
                       col_related="removed",
                       title="Accuracy of classifiers\n for different n_neighbors parameter\nof OD method NearestNeighbors\nsorted on the mean values\n",
                       x_title="% of removed outliers",
                       y_title="change in gain after OD",
                       legend_title="parameter n_neighbors")

exit()

# NOTE(review): everything below is unreachable because of the exit() call
# above. It still carries IsolationForest / n_estimators labels and file
# names, so it presumably predates the NearestNeighbors run - confirm before
# re-enabling it.

# preset configuration
grouped_by = "clf"

# extract data
gbc = df.groupby(grouped_by)

for group, group_df in gbc:
    print_nice_scatterplot(data=group_df,
                           graph_filename='clf_od_params/IsoForest-scatter-od_params-' + group + '.png',
                           col_examined="gain",
                           col_related="od_params",
                           col_grouped_by="removed",
                           x_title='n_estimators value',
                           y_title='Increase in gain after OD',
                           title="Changes in accuracy of classifier " + group + "\nbased on parameter n_estimators \n of OD method IsolationForest for different % of removed outliers\n",
                           legend_title="% of removed outliers")