-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeatures.py
More file actions
74 lines (57 loc) · 2.33 KB
/
Copy pathfeatures.py
File metadata and controls
74 lines (57 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
from sklearn.model_selection import train_test_split
def extract_receipt_features(
    df,
    id_col="id_bon",
    product_col="retail_product_name",
    price_col="SalePriceWithVAT",
    datetime_col="data_bon",
    binary_products=True,
    exclude_from_aggregates=None,
):
    """Turn a table of receipt line items into one feature row per receipt.

    The result is indexed by ``id_col`` and contains, in this order:

    * one column per distinct value of ``product_col`` (a receipt x product
      matrix),
    * ``cart_size`` and ``total_value`` aggregates,
    * the 0/1 flags ``is_weekend``, ``is_morning``, ``is_afternoon``,
      ``is_evening`` and ``is_night`` derived from the earliest timestamp
      found on each receipt.

    Args:
        df: Line-item frame holding at least the four columns named below.
            It is copied first, so the caller's frame is not modified.
        id_col: Column identifying the receipt a line belongs to.
        product_col: Column holding the product name of a line.
        price_col: Column holding the price of a line.
        datetime_col: Column holding the receipt timestamp. It is parsed with
            ``pd.to_datetime(..., errors="coerce")``, so unparseable values
            become ``NaT``.
        binary_products: When true, product columns hold 1.0/0.0 presence
            flags; otherwise they hold the number of lines per product.
        exclude_from_aggregates: Optional iterable of product names whose
            lines are left out of ``cart_size`` and ``total_value`` only.
            The product columns and the time flags still use every line.

    Returns:
        A 3-tuple ``(receipt_df, product_cols, product_prices)``:
        ``receipt_df`` is the feature frame described above with remaining
        gaps filled with 0, ``product_cols`` is the list of product column
        labels, and ``product_prices`` maps each product to the price on the
        first line where both product and price are present.
    """
    # Private copy: the datetime column is overwritten with parsed values.
    lines = df.copy()
    lines[datetime_col] = pd.to_datetime(lines[datetime_col], errors="coerce")

    # Price lookup: keep the first line per product among the lines that
    # have both a product and a price.
    priced_lines = lines.dropna(subset=[product_col, price_col])
    first_line_per_product = priced_lines.drop_duplicates(subset=[product_col])
    product_prices = (
        first_line_per_product.set_index(product_col)[price_col].to_dict()
    )

    # Receipt x product matrix, built by summing a constant 1 per line.
    occurrences = lines[[id_col, product_col]].assign(**{"__cnt__": 1})
    receipt_products = occurrences.pivot_table(
        index=id_col,
        columns=product_col,
        values="__cnt__",
        aggfunc="sum",
        fill_value=0,
    ).astype(float)
    if binary_products:
        # Collapse line counts to presence/absence, kept as floats.
        receipt_products = receipt_products.gt(0).astype(float)
    product_cols = receipt_products.columns.tolist()

    # Basket aggregates optionally ignore some products.
    excluded = set(exclude_from_aggregates or [])
    if excluded:
        kept_lines = lines.loc[~lines[product_col].isin(excluded)]
    else:
        kept_lines = lines
    basket_stats = kept_lines.groupby(id_col).agg(
        cart_size=(product_col, "count"),
        total_value=(price_col, "sum"),
    )

    # The timestamp is taken from ALL lines, so a receipt whose lines were
    # all excluded above still gets a row; its aggregates become 0.
    earliest = lines.groupby(id_col).agg(min_time=(datetime_col, "min"))
    receipt_numeric = earliest.join(basket_stats, how="left")
    for stat in ("cart_size", "total_value"):
        receipt_numeric[stat] = receipt_numeric[stat].fillna(0)

    # Replace the raw timestamp with calendar flags. For NaT the hour and
    # weekday are NaN, every comparison is False, and all flags end up 0.
    first_seen = receipt_numeric.pop("min_time")
    hour = first_seen.dt.hour
    receipt_numeric["is_weekend"] = (first_seen.dt.dayofweek >= 5).astype(int)
    hour_windows = (
        ("is_morning", 6, 12),
        ("is_afternoon", 12, 18),
        ("is_evening", 18, 24),
        ("is_night", 0, 6),
    )
    for flag, start, stop in hour_windows:
        receipt_numeric[flag] = ((hour >= start) & (hour < stop)).astype(int)

    receipt_df = receipt_products.join(receipt_numeric, how="left").fillna(0)
    return receipt_df, product_cols, product_prices
def split_train_test(receipt_df, test_size=0.2, random_state=42):
    """Split the receipt feature table into shuffled train and test parts.

    Thin wrapper around ``train_test_split`` that always shuffles.

    Args:
        receipt_df: The table to split; forwarded as the only array argument.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``,
            so the same value gives the same split.

    Returns:
        Whatever ``train_test_split`` returns for a single input, unchanged.
        Per the scikit-learn documentation this is a two-element list holding
        the train part followed by the test part.
    """
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "shuffle": True,
    }
    return train_test_split(receipt_df, **split_options)