-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfirst_method_custom_functions.py
More file actions
193 lines (178 loc) · 9.55 KB
/
first_method_custom_functions.py
File metadata and controls
193 lines (178 loc) · 9.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
%%writefile custom_tools.py
import numpy as np
import pandas as pd
from collections import defaultdict
# Maps an output feature column name to the substrings that switch it on.
# The match is a case-sensitive substring test against product descriptions.
# NOTE(review): spellings such as 'FUSHIA', 'WHIT', 'COOPER', 'NECKLAGE' and
# 'TURQUISE' are kept exactly as written -- presumably they mirror the text
# found in the data; confirm before "correcting" any of them.
dict_additional_columns = {
    # colours
    'color_RED': ['RED'],
    'color_PINK': ['PINK', 'FUSHIA'],
    'color_BLUE': ['BLUE'],
    'color_GREEN': ['GREEN'],
    'color_WHITE': ['WHITE', 'WHIT'],
    'color_BLACK': ['BLACK', 'BLK', 'BLCK'],
    'color_CREAM': ['CREAM'],
    'color_GOLD': ['GOLD'],
    'color_SILVER': ['SILVER'],
    'color_COOPER': ['COOPER'],
    'color_ASSORTED': ['ASSORTED'],
    # order / line categories
    'cat_POSTAGE': ['POSTAGE'],
    'cat_SAMPLES': ['SAMPLES'],
    'cat_MANUAL': ['Manual'],
    'cat_FEES': ['Bank Charges', 'bank charges', 'AMAZON FEE', 'CRUK Commission'],
    'cat_ADJUSTMENT': ['Adjustment'],
    'cat_SET': ['SET'],
    # 'cat_FURNITURE': ['DRAWER', 'CABINET', 'DRESSER', 'SEAT', 'SIDEBOARD', 'MIRROR', 'TABLE'],
    'cat_MATERIAL': ['QUILT', 'FLAG'],
    # materials
    'material_WOOD': ['WOOD'],
    'material_CERAMIC': ['CERAMIC'],
    'material_GLASS': ['GLASS'],
    'material_GEM': [
        'AMETHYST', 'DIAMANTE', 'RUBY', 'AMBER', 'TURQUISE',
        'QUARTZ', 'GEMSTONE', 'CRYSTAL', 'JADE',
    ],
    'material_ENAMEL': ['ENAMEL'],
    'material_METAL': ['COOPER', 'ZINC', 'BRONZE'],
    # styles
    'style_RETRO': ['RETRO'],
    'style_VINTAGE': ['VINTAGE'],
    'style_HISTORIC': [
        'EDWARDIAN', 'FRENCH', 'BAROQUE', 'MOROCCAN',
        'ANTIQUE', 'ANT', 'RUSTIC', 'REGENCY',
    ],
    'style_MODERN': ['MODERN', 'SCANDINAVIAN'],
    # product types
    'type_JEWELRY': ['NECKLAGE', 'BEAD', 'RING', 'JEWEL', 'BRACELET'],
    'type_CHRISTMAS': ['CHRISTMAS'],
    'type_BAG': ['BAG'],
    'type_CONTAINER': ['TIN', 'BOX', 'CHEST', 'JAR'],
}
# Country names that the (currently commented-out) 'country_aggregated'
# feature in prepare_train_and_test would collapse into a single 'Other' value.
countries = [
    'Saudi Arabia',
    'Czech Republic',
    'Nigeria',
    'Bermuda',
    'West Indies',
    'Lebanon',
    'European Community',
    'Korea',
    'Thailand',
    'Brazil',
]
def prepare_train_and_test(train, test):
    """Build the per-invoice feature frames for the train and test sets.

    The two frames are stacked so that whole-dataset features can be derived
    in one place, then split back apart on whether ``is_canceled`` is known
    and passed through ``prepare_dataframe``.

    Args:
        train: row-level DataFrame that carries an ``is_canceled`` column.
        test: row-level DataFrame; its ``is_canceled`` is set to NaN here.

    Returns:
        Tuple ``(new_train, new_test)`` of frames produced by
        ``prepare_dataframe``.
    """
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    # assign() returns a new frame, so the caller's `test` is left untouched.
    test = test.assign(is_canceled=np.nan)
    df_all = pd.concat([train, test], sort=False)

    # I didn't use those in the best result but I left them just in case you were curious what I was testing
    #df_all = prepare_product_customer_statistics(df_all)
    #df_all['country_aggregated'] = df_all['country'].apply(lambda x: 'Other' if any([country in x for country in countries]) else x)
    #df_all['cat_country'] = pd.factorize(df_all['country_aggregated'])[0]

    # Rows with a known label are the train part; rows with NaN are the test part.
    has_label = df_all['is_canceled'].notnull()
    new_train_all_rows = df_all[has_label].copy()
    new_test_all_rows = df_all[~has_label].copy()

    new_train = prepare_dataframe(new_train_all_rows, train=True)
    new_test = prepare_dataframe(new_test_all_rows, train=False)
    return new_train, new_test
def prepare_product_customer_statistics(df_all):
    """Add per-product and per-customer cancellation statistics to ``df_all``.

    Statistics are computed from the rows whose ``is_canceled`` is not null
    and mapped back onto every row. Keys that have no statistic (including
    the key ``-1``, which is dropped on purpose) receive the mean of the
    remaining per-key values.

    Adds, in place, the columns ``cnt_p_product_cancel``,
    ``cnt_p_product_orders``, ``ratio_p_product_orders``,
    ``cnt_customer_cancel``, ``cnt_customer_orders`` and
    ``ratio_customer_orders``, and returns the same frame.
    """

    def labelled_lookup(group_key, agg_func=np.sum):
        # Aggregate is_canceled per key over the labelled rows only.
        labelled = df_all[df_all['is_canceled'].notnull()]
        per_key = labelled.groupby(group_key)['is_canceled'].agg(agg_func).to_dict()
        # The -1 key is excluded so it falls back to the mean like any unseen key.
        per_key.pop(-1, None)
        fallback = np.mean(list(per_key.values()))
        return defaultdict(lambda: fallback, per_key)

    # (grouping column, cancel-count column, order-count column, ratio column)
    # A per-country variant of the product cancel count was tried and disabled:
    #df_all['cnt_p_product_cancel_country'] = df_all['stock_code'].map(group_to_dict(['stock_code', 'country'])).astype('float64')
    feature_specs = (
        ('stock_code', 'cnt_p_product_cancel', 'cnt_p_product_orders', 'ratio_p_product_orders'),
        ('customer_id', 'cnt_customer_cancel', 'cnt_customer_orders', 'ratio_customer_orders'),
    )
    for key, cancel_col, orders_col, ratio_col in feature_specs:
        df_all[cancel_col] = df_all[key].map(labelled_lookup(key)).astype('float64')
        df_all[orders_col] = df_all[key].map(labelled_lookup(key, agg_func=np.size))
        df_all[ratio_col] = (df_all[cancel_col] / df_all[orders_col]).round(5)

    return df_all
def prepare_dataframe(df, train=False):
    """Turn a row-level frame into the per-invoice feature frame.

    Runs, in order: ``create_orders_df`` (aggregation per invoice),
    ``get_invoice_date_parameters`` (date parts) and
    ``get_additional_bool_column_from_description`` (keyword flags, which
    also needs the row-level ``df``).

    Args:
        df: row-level DataFrame.
        train: forwarded to ``create_orders_df``.

    Returns:
        The per-invoice DataFrame returned by the last step.
    """
    orders = get_invoice_date_parameters(create_orders_df(df, train))
    # One-hot encoding of 'country' is intentionally disabled:
    #prepared_df = one_hot_encoding(prepared_df, 'country')
    return get_additional_bool_column_from_description(orders, df)
def prepare_additional_features(df, rows_df):
    """Add invoice-level aggregates to every row of ``df``.

    For each row the value is the aggregate over all rows sharing its
    ``invoice``. ``df`` must already contain ``stock_code``, ``price_unit``,
    ``price_total``, ``cnt_p_product_cancel`` and ``ratio_p_product_orders``
    (the last two are the columns written by
    ``prepare_product_customer_statistics``).

    The body previously started with statements that referenced an undefined
    ``df_all`` and called ``group_to_dict`` before it was defined, so every
    call raised; those statements are removed.

    Args:
        df: row-level DataFrame, modified in place.
        rows_df: unused; kept so existing call sites keep working.

    Returns:
        ``df`` with ``different_items``, ``price_unit_median``,
        ``log_price_full_invoice`` and ``max_return_product_invoice`` added
        and ``ratio_p_product_orders`` replaced by its per-invoice maximum.
    """

    def group_to_dict(group_key, column, agg_func='sum'):
        # Per-key aggregate of `column`; the -1 key and any unseen key fall
        # back to the mean of the remaining per-key values.
        dict_ = df.groupby(group_key)[column].agg(agg_func).to_dict()
        dict_.pop(-1, None)
        mean = np.mean(list(dict_.values()))
        return defaultdict(lambda: mean, dict_)

    df['different_items'] = df['invoice'].map(group_to_dict('invoice', 'stock_code', agg_func='size'))
    #df['all_quantity'] = df['invoice'].map(group_to_dict('invoice','quantity', agg_func=np.sum))
    df['price_unit_median'] = df['invoice'].map(group_to_dict('invoice', 'price_unit', agg_func='median'))
    # +6 offset is taken as-is from the original code before the log.
    df['log_price_full_invoice'] = np.log2(
        df['invoice'].map(group_to_dict('invoice', 'price_total', agg_func='sum')) + 6
    )
    df['max_return_product_invoice'] = df['invoice'].map(
        group_to_dict('invoice', 'cnt_p_product_cancel', agg_func='max')
    )
    # Overwrites the row-level ratio with the per-invoice maximum.
    df['ratio_p_product_orders'] = df['invoice'].map(
        group_to_dict('invoice', 'ratio_p_product_orders', agg_func='max')
    )
    return df
def get_additional_column_from_description(prepared_df, df):
    """Fill one column per ``dict_additional_columns`` key on ``prepared_df``.

    Each row of ``prepared_df`` is handed to ``check_for_string`` together
    with the row-level ``df`` and the list of target column names; whatever
    that call returns is assigned to those columns.

    NOTE(review): ``check_for_string`` is not defined in the visible part of
    this file -- confirm it exists before calling this function.

    Returns:
        ``prepared_df`` (modified in place).
    """
    flag_columns = [*dict_additional_columns]

    def flags_for(row):
        # row.name is the index label of the row being processed.
        return check_for_string(row.name, df, flag_columns)

    prepared_df[flag_columns] = prepared_df.apply(flags_for, axis=1)
    return prepared_df
def get_additional_bool_column_from_description(prepared_df, df, additional_columns=None):
    """Add keyword-presence flags derived from item descriptions.

    All descriptions belonging to one invoice are joined with ``' | '`` into
    ``joined_descriptions``; then, for every ``column -> words`` entry of the
    mapping, ``column`` is True when any of the words occurs as a
    case-sensitive substring of the joined text.

    The joined text is built with a single group-by over ``df`` instead of
    filtering ``df`` once per invoice, and ``df`` itself is no longer
    modified (descriptions are cast to ``str`` on a copy).

    Args:
        prepared_df: per-invoice DataFrame indexed by invoice; modified in
            place and returned.
        df: row-level DataFrame with ``invoice`` and ``description`` columns.
        additional_columns: optional ``{column: [words]}`` mapping; defaults
            to the module-level ``dict_additional_columns``.

    Returns:
        ``prepared_df`` with ``joined_descriptions`` and one boolean column
        per mapping key. Invoices absent from ``df`` get an empty string and
        therefore all-False flags.
    """
    if additional_columns is None:
        additional_columns = dict_additional_columns

    # Positional construction avoids any index alignment between the columns.
    descriptions = pd.Series(
        df['description'].astype(str).to_numpy(),
        index=df['invoice'].to_numpy(),
    )
    joined_by_invoice = descriptions.groupby(level=0, sort=False).agg(' | '.join)
    joined = joined_by_invoice.reindex(prepared_df.index).fillna('')
    prepared_df['joined_descriptions'] = joined.to_numpy()

    for column, words in additional_columns.items():
        # words is bound as a default so the lambda does not depend on the loop variable.
        prepared_df[column] = prepared_df['joined_descriptions'].apply(
            lambda text, words=words: any(word in text for word in words)
        )
    return prepared_df
def create_orders_df(df, train=False):
    """Aggregate a row-level frame into one row per invoice.

    Produces, indexed by ``invoice``:
      * sums of ``price_total`` and ``quantity``;
      * for ``train=True``: ``is_canceled`` (True when any row of the invoice
        is canceled) and ``total_return`` (``price_total`` of canceled
        invoices, 0 otherwise);
      * the first ``invoice_date`` and ``customer_id`` of each invoice, plus
        ``country_aggregated`` and ``is_test`` when those columns exist;
      * mean/min/max/median/std of ``price_unit`` and ``price_total``
        computed in float32 (std rounded to 2 decimals, -1 when undefined),
        and ``price_unit_min_max_diff``;
      * ``different_items`` (non-null ``quantity`` count), ``unknown_buyer``
        (``customer_id == -1``) and ``log_price_total``.

    ``country_aggregated`` and ``is_test`` are treated as optional because
    the only code that creates ``country_aggregated`` is commented out in
    ``prepare_train_and_test``; requiring it raised a KeyError. The input
    ``df`` is not modified.

    Args:
        df: row-level DataFrame.
        train: when True, also aggregate the ``is_canceled`` label.

    Returns:
        The per-invoice DataFrame.
    """
    stat_names = ['mean', 'min', 'max', 'median', 'std']
    optional_first = {'country_aggregated', 'is_test'}
    columns_first = [
        name
        for name in ('invoice_date', 'country_aggregated', 'customer_id', 'is_test')
        if name not in optional_first or name in df.columns
    ]
    columns_sum = ['price_total', 'quantity']
    if train:
        columns_sum.append('is_canceled')

    by_invoice = df.groupby('invoice')

    orders = by_invoice[columns_sum].sum()
    if train:
        orders['is_canceled'] = orders['is_canceled'] > 0
        orders['total_return'] = orders['price_total'] * orders['is_canceled']

    orders = orders.join(by_invoice[columns_first].first(), how='inner')

    for column in ('price_unit', 'price_total'):
        # Statistics are computed on a float32 copy; df keeps its dtype.
        as_float32 = df[column].astype(np.float32)
        stats = as_float32.groupby(df['invoice']).agg(stat_names)
        stats.columns = [f'{column}_{name}' for name in stat_names]
        # std is NaN for single-row invoices; -1 marks "not defined".
        stats[f'{column}_std'] = stats[f'{column}_std'].round(2).fillna(-1)
        if column != 'price_total':
            stats[f'{column}_min_max_diff'] = stats[f'{column}_max'] - stats[f'{column}_min']
        orders = orders.join(stats, how='inner')

    item_counts = by_invoice[['quantity']].count().rename(columns={'quantity': 'different_items'})
    orders = orders.join(item_counts, how='inner')

    orders['unknown_buyer'] = orders['customer_id'] == -1
    # +6 offset is taken as-is from the original code before the log.
    orders['log_price_total'] = np.log2(orders['price_total'] + 6)
    return orders
def get_invoice_date_parameters(df):
    """Add calendar features derived from the ``invoice_date`` column.

    Writes ``month``, ``hour``, ``day_of_week`` and ``weekend`` onto ``df``
    in place and returns it. ``weekend`` is 0 when ``day_of_week`` is below 5
    and 1 otherwise.

    Other date parts were tried and are disabled: year, day, minute,
    day_of_year, week_of_year, quarter and a part-of-day bucket
    (``get_part_of_day`` applied to ``hour``).
    """
    stamps = df['invoice_date'].dt

    df['month'] = stamps.month
    df['hour'] = stamps.hour
    df['day_of_week'] = stamps.dayofweek

    is_weekday = df['day_of_week'] < 5
    df['weekend'] = np.where(is_weekday, 0, 1)
    return df
def get_part_of_day(hour):
    """Bucket an hour of the day into a part-of-day code.

    Returns:
        0 for 5..11, 1 for 12..17, 2 for 18..22, and 3 for anything else.
    """
    if 5 <= hour <= 11:
        return 0
    if 12 <= hour <= 17:
        return 1
    if 18 <= hour <= 22:
        return 2
    return 3
def one_hot_encoding(df, column):
    """Append one indicator column per distinct value of ``column``.

    ``df[column]`` is converted to a categorical in place; the indicator
    columns are named ``<column>_<value>`` and appended to the right.

    Returns:
        A new DataFrame made of ``df`` plus the indicator columns.
    """
    df[column] = pd.Categorical(df[column])
    indicators = pd.get_dummies(df[column], prefix=column)
    return pd.concat([df, indicators], axis=1)
def get_features(df, black_list=('total_return', 'is_canceled', 'is_test', 'price_unit_std', 'day_of_week')):
    """Return the names of the numeric and boolean columns usable as features.

    Args:
        df: DataFrame to inspect.
        black_list: column names to exclude. The default is an immutable
            tuple so it cannot be altered between calls; any container that
            supports ``in`` is accepted.

    Returns:
        List of column names, in ``df`` column order.
    """
    candidates = df.select_dtypes(include=[np.number, 'bool']).columns
    return [name for name in candidates if name not in black_list]
def tidy_split(df, column, sep='|', keep=False):
    """Explode a delimited string column into one row per piece.

    Rows whose ``column`` is null are dropped first. Every remaining row is
    repeated once per piece of ``str(value).split(sep)``, with ``column``
    holding that piece; the other columns and the index label are copied.

    Args:
        df: source DataFrame (not modified).
        column: name of the column to split.
        sep: delimiter passed to ``str.split``.
        keep: when True, rows that split into more than one piece also emit
            an extra leading row carrying the unsplit value.

    Returns:
        A new DataFrame with the exploded rows.
    """
    df = df.dropna(subset=[column])

    positions = []
    pieces_out = []
    for position, cell in enumerate(df[column].astype(str)):
        parts = cell.split(sep)
        emitted = [cell, *parts] if keep and len(parts) > 1 else parts
        positions.extend([position] * len(emitted))
        pieces_out.extend(emitted)

    new_df = df.iloc[positions, :].copy()
    new_df[column] = pieces_out
    return new_df