-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreprocessor.py
More file actions
108 lines (83 loc) · 2.77 KB
/
Preprocessor.py
File metadata and controls
108 lines (83 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import time
import numpy as np
from enum import Enum
from sklearn import random_projection, feature_selection, decomposition
from sklearn import preprocessing
# Dimensionality-reduction strategies supported by Preprocessor.
# Built with the Enum functional API; members and values are identical
# to the original class-based declaration.
Reducers = Enum(
    "Reducers",
    [
        ("none", 0),                # no reduction
        ("random_projection", 1),   # Gaussian random projection
        ("feature_selection", 2),   # variance-threshold selection
        ("pca", 3),                 # principal component analysis
    ],
)
## DEFAULTS:
normalize_default = False
# NOTE: "defualt" is a typo, but the misspelled name is kept because it is the
# public module-level constant existing code (and Preprocessor.__init__) binds to.
reduce_features_defualt = False
# Correctly spelled alias; prefer this name in new code.
reduce_features_default = reduce_features_defualt
reducer_type_default = Reducers.none
# Fraction of variance to preserve (used by the PCA reducer, and to derive the
# variance threshold for feature selection).
explained_variance_default = 0.95
class Preprocessor:
    """Optional feature standardization and dimensionality reduction.

    Typical usage: call ``train(features)`` once on a training feature
    matrix to fit the configured transforms, then ``process(features)``
    to apply the fitted transforms to new feature matrices.
    """

    def __init__(self,
                 normalize=normalize_default,
                 reduce_features=reduce_features_defualt,
                 reducer_type=reducer_type_default,
                 explained_variance=explained_variance_default):
        """Configure the preprocessing pipeline.

        Args:
            normalize: if True, fit/apply a StandardScaler.
            reduce_features: if True, fit/apply the reducer chosen by
                ``reducer_type``.
            reducer_type: a ``Reducers`` member selecting the reduction
                strategy.
            explained_variance: fraction of variance to keep (PCA), also
                used to derive the variance threshold for feature selection.

        Raises:
            ValueError: if ``reduce_features`` is True but ``reducer_type``
                is ``Reducers.none``.
        """
        self.normalize = normalize
        self.reduce_features = reduce_features
        self.reducer_type = reducer_type
        self.explained_variance = explained_variance
        # Validate explicitly: `assert` is stripped under `python -O`.
        if reduce_features and reducer_type == Reducers.none:
            raise ValueError(
                "reduce_features=True requires a reducer_type other than Reducers.none")
        self.normalizer = None  # fitted StandardScaler, set by train()
        self.reducer = None     # fitted reducer, set by train()

    def train(self, features):
        """Fit the normalizer and/or reducer on a training feature matrix.

        Args:
            features: 2-D array-like of shape (n_samples, n_features).

        Raises:
            ValueError: if ``features`` contains NaN or infinite values.
        """
        # Setup:
        start_time = time.time()
        # Check feature set (explicit raise rather than a strippable assert):
        if not np.isfinite(features).all():
            raise ValueError("Training features contain NaN or infinite values.")
        # Normalizer — fit and transform in one pass so the reducer below
        # is trained on the normalized features:
        if self.normalize:
            standardizer = preprocessing.StandardScaler()
            features = standardizer.fit_transform(features)
            self.normalizer = standardizer
        # The reducer options are mutually exclusive, hence the elif chain.
        # Option 1 (Random Projection):
        if self.reducer_type == Reducers.random_projection:
            transformer = random_projection.GaussianRandomProjection()
            transformer.fit(features)
            self.reducer = transformer
        # Option 2 (Feature Selection):
        elif self.reducer_type == Reducers.feature_selection:
            # p * (1 - p) is the variance of a Bernoulli feature with
            # success probability p = explained_variance.
            threshold = (self.explained_variance) * (1 - self.explained_variance)
            selector = feature_selection.VarianceThreshold(threshold=threshold)
            selector.fit(features)
            self.reducer = selector
        # Option 3 (PCA):
        elif self.reducer_type == Reducers.pca:
            # With 0 < n_components < 1 and svd_solver="full", PCA keeps
            # enough components to explain that fraction of the variance.
            pca = decomposition.PCA(n_components=self.explained_variance, svd_solver="full")
            pca.fit(features)
            self.reducer = pca
        # Calculate elapsed time:
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Training preprocessor took %.2f seconds" % elapsed_time)

    def process(self, features):
        """Apply the fitted transforms to a feature matrix.

        Args:
            features: 2-D array-like of shape (n_samples, n_features).

        Returns:
            The transformed feature matrix, or ``features`` unchanged when
            neither normalization nor reduction is configured.

        Raises:
            ValueError: if ``features`` contains NaN or infinite values.
            RuntimeError: if called before ``train()``.
        """
        # Setup:
        start_time = time.time()
        initial_feature_size = features.shape[1]
        # Check feature set — raise instead of printing and continuing with
        # invalid data (the old behavior silently propagated NaN/inf):
        if not np.isfinite(features).all():
            raise ValueError("Features contain NaN or infinite values.")
        # Check args:
        if not self.normalize and not self.reduce_features:
            print("No preprocessing done.")
            return features
        # Standardize:
        if self.normalize:
            if self.normalizer is None:
                raise RuntimeError("Preprocessor.process() called before train().")
            features = self.normalizer.transform(features)
        # Reduce features:
        if self.reduce_features:
            if self.reducer is None:
                raise RuntimeError("Preprocessor.process() called before train().")
            features = self.reducer.transform(features)
        # Calculate elapsed time:
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Preprocessing took %.2f seconds" % elapsed_time)
        print("Reduced feature size from %d to %d" % (initial_feature_size, features.shape[1]))
        return features