-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathablation_encoding_analysis.py
More file actions
70 lines (62 loc) · 4.26 KB
/
ablation_encoding_analysis.py
File metadata and controls
70 lines (62 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
import numpy as np
import pandas as pd
from data_utils import Preprocessor, PreprocessorOrdinal, PreprocessorOneHot
# Column order of the results table written by main().
RESULT_COLUMNS = ["Dataset", "Encoding", "Mask", "Avg. MSE", "Std. MSE", "stride", "indim"]

# Maps each encoding label to the tag embedded in its synthetic CSV file name.
ENCODING_FILE_TAGS = {"STD": "cycStd", "ORD": "ordinal", "OHE": "onehot"}


def synth_csv_path(dir_path, encoding, stride, trial):
    """Return the path of the synthetic CSV for one encoding, stride and trial.

    Args:
        dir_path: Directory holding the generated CSVs of one dataset/mask pair.
        encoding: Encoding label; must be a key of ENCODING_FILE_TAGS.
        stride: Stride value embedded in the file name.
        trial: Trial index embedded in the file name.

    Raises:
        KeyError: If ``encoding`` is not a known encoding label.
    """
    tag = ENCODING_FILE_TAGS[encoding]
    file_name = (
        f"synth_wavestitch_pipeline_stride_{stride}_trial_{trial}"
        f"_{tag}_grad_simplecoeff.csv"
    )
    return os.path.join(dir_path, file_name)


def read_generated_csv(path):
    """Load a generated CSV and drop its 'Unnamed: 0' column.

    Raises:
        KeyError: If the file has no 'Unnamed: 0' column.
    """
    return pd.read_csv(path).drop(columns=["Unnamed: 0"])


def mean_squared_error(df_synth, df_real):
    """Return the mean over columns of the per-column mean squared difference."""
    return ((df_synth - df_real) ** 2).mean().mean()


def main():
    """Aggregate per-encoding MSE between real and synthetic data into a CSV.

    For every dataset, mask level, stride and encoding, the real CSV and the
    ``num_trials`` synthetic CSVs under ``generated/<dataset>/<mask>/`` are
    loaded, passed through the ordinal preprocessor's ``cleanDataset`` and
    compared on the columns that are neither hierarchical nor one-hot encoded.
    The mean and standard deviation of the per-trial MSE are written to
    ``experiments/ablations/encoding/ablation_encoding_wavestitch_grad_simplecoeff.csv``.
    """
    datasets = ["MetroTraffic", "AustraliaTourism", "PanamaEnergy", "RossmanSales", "BeijingAirQuality"]
    encodings = ["STD", "ORD", "OHE"]
    mask_levels = ["C", "M", "F"]
    strides = [8]
    num_trials = 5

    rows = []
    for dataset in datasets:
        preprocessors = {
            "STD": Preprocessor(dataset, False),
            "ORD": PreprocessorOrdinal(dataset),
            "OHE": PreprocessorOneHot(dataset),
        }
        preprocessor_ord = preprocessors["ORD"]
        preprocessor_ohe = preprocessors["OHE"]

        for mask in mask_levels:
            dir_path = f"generated/{dataset}/{mask}/"
            df_real = read_generated_csv(os.path.join(dir_path, "real.csv"))
            # Real and synthetic frames both go through the ordinal
            # preprocessor so they are compared in the same representation
            # (presumably a scaled one -- confirm in data_utils).
            df_real_scaled = preprocessor_ord.cleanDataset(dataset, df_real)

            # Columns entering the MSE: not hierarchical and not one-hot
            # encoded. The selection depends only on the dataset/mask pair,
            # so it is computed once here rather than once per trial.
            real_valued_columns = [
                col
                for col in df_real.columns
                if col not in preprocessor_ord.hierarchical_features
                and col not in preprocessor_ohe.onehot_encoded_columns
            ]
            df_real_scaled_rnh = df_real_scaled[real_valued_columns]

            for stride in strides:
                for encoding in encodings:
                    # Number of columns of this encoding's cleaned data.
                    indim = preprocessors[encoding].df_cleaned.values.shape[1]

                    mses = []
                    for trial in range(num_trials):
                        df_synth = read_generated_csv(
                            synth_csv_path(dir_path, encoding, stride, trial)
                        )
                        df_synth_scaled = preprocessor_ord.cleanDataset(dataset, df_synth)
                        mses.append(
                            mean_squared_error(
                                df_synth_scaled[real_valued_columns],
                                df_real_scaled_rnh,
                            )
                        )

                    rows.append(
                        {
                            "Dataset": dataset,
                            "Encoding": encoding,
                            "Mask": mask,
                            "Avg. MSE": np.mean(mses),
                            "Std. MSE": np.std(mses),
                            "stride": stride,
                            "indim": indim,
                        }
                    )

    output_dir = "experiments/ablations/encoding/"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    final_path = os.path.join(output_dir, "ablation_encoding_wavestitch_grad_simplecoeff.csv")
    # Build the table once from the collected rows instead of growing it row by row.
    pd.DataFrame(rows, columns=RESULT_COLUMNS).to_csv(final_path)


if __name__ == "__main__":
    main()