# main.py
# Script: builds per-receipt features from ap_dataset.csv, trains one model per
# (algorithm, sauce) pair, prints a sauce ranking for one sampled test receipt,
# plots per-model metrics, and compares Hit Rate @ K against a popularity baseline.
import pandas as pd
from features import extract_receipt_features, split_train_test
from models import train_all_models, rank_products
from evaluation import plot_confusion_and_metrics, compute_baseline_ranking, evaluate_hit_rate, plot_baseline_comparison
# Sauce product names. They are matched against the "retail_product_name"
# column and passed to the training, ranking, and evaluation helpers.
sauces = [
    "Crazy Sauce",
    "Cheddar Sauce",
    "Extra Cheddar Sauce",
    "Garlic Sauce",
    "Tomato Sauce",
    "Blueberry Sauce",
    "Spicy Sauce",
    "Pink Sauce",
]
# Load the raw dataset; this script reads its "retail_product_name" and
# "id_bon" columns directly.
df = pd.read_csv("ap_dataset.csv")

# How many distinct receipts (id_bon) contain each sauce, largest count first.
# NOTE(review): sauce_receipts is not referenced again in the visible script.
sauce_receipts = (
    df.loc[df["retail_product_name"].isin(sauces)]
    .groupby("retail_product_name")["id_bon"]
    .nunique()
)
sauce_receipts = sauce_receipts.sort_values(ascending=False)

# Build the per-receipt feature table. The sauces are passed as
# exclude_from_aggregates (presumably so they do not leak into aggregate
# features -- confirm in features.extract_receipt_features).
receipt_df, product_cols, product_prices = extract_receipt_features(
    df,
    exclude_from_aggregates=sauces,
)
train_df, test_df = split_train_test(receipt_df)

# Debug aid: list the columns available in the training frame.
print(train_df.columns.tolist())
# Draw a single receipt from the test split; the fixed random_state keeps the
# pick reproducible for a given test_df.
row = test_df.sample(n=1, random_state=42).iloc[0]

# Products with a positive value on the sampled receipt, and the same cart with
# the sauces removed (the cart handed to the ranking step).
full_cart = [name for name in product_cols if row.get(name, 0) > 0]
partial_cart = [name for name in full_cart if name not in sauces]

# Non-product features for the sampled receipt. cart_size counts the products
# in the sauce-free cart; the remaining values are copied from the row as floats.
numeric_features = {
    "cart_size": len(partial_cart),
    **{
        column: float(row[column])
        for column in (
            "total_value",
            "is_weekend",
            "is_morning",
            "is_afternoon",
            "is_evening",
            "is_night",
        )
    },
}
# Algorithm identifiers handed to the training, ranking, and evaluation helpers.
algorithms = [
    "logistic_regression",
    "naive_bayes",
    "id3",
    "adaboost",
]

# Train the models. The result is later indexed as model_store[(algorithm, sauce)],
# and each entry exposes "model" and "feature_cols". The sauces are passed both
# positionally and as the columns to exclude.
model_store = train_all_models(train_df, sauces, algorithms, exclude_columns=sauces)
# RANKING
# For each algorithm, print a header followed by the numbered (product, score)
# pairs returned by rank_products for the sampled sauce-free cart.
for algo_name in algorithms:
    print(f"\n{algo_name.upper()}")
    scored_sauces = rank_products(
        model_store, algo_name, sauces, partial_cart, product_prices, numeric_features
    )
    # Positions are 1-based; scores are shown with three decimals.
    for position, (product, score) in enumerate(scored_sauces, start=1):
        print(f"{position}. {product} => {score:.3f}")
# PLOTS
# One plot_confusion_and_metrics call per (algorithm, sauce) pair, visiting all
# sauces for the first algorithm before moving on to the next one.
for algo_name, sauce_name in ((a, s) for a in algorithms for s in sauces):
    trained = model_store[(algo_name, sauce_name)]
    plot_confusion_and_metrics(
        trained["model"], trained["feature_cols"], test_df, sauce_name, algo_name
    )
# Compute the popularity-based baseline ranking of the sauces and print it as a
# 1-based numbered list.
baseline_ranking = compute_baseline_ranking(df, sauces, method="popularity")

print("\n BASELINE RANKING dupa popularitate:")
for position, sauce_name in enumerate(baseline_ranking, start=1):
    print(f"{position}. {sauce_name}")
# Evaluate Hit Rate @ K for every trained algorithm and for the baseline ranking.
k_values = [1, 3, 5]
hit_rate_results, total_evaluated = evaluate_hit_rate(
    test_df, sauces, product_cols, product_prices,
    model_store, algorithms, baseline_ranking,
    k_values=k_values,
)

# Report the second value returned by evaluate_hit_rate (the printed label is
# Romanian for "receipts evaluated").
print(f"\nBonuri evaluate: {total_evaluated}")

# Draw the comparison chart for the same K values.
plot_baseline_comparison(hit_rate_results, k_values)