Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion app
Submodule app deleted from eaaf35
Empty file added app/__init__.py
Empty file.
63 changes: 63 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os
import io
import joblib
import pandas as pd
from flask import Flask, request, jsonify

from src.feature_engineering import engineer_features
from src.data_preprocessing import preprocess_data

app = Flask(__name__)

# Serialized artifacts produced by generate_model.py.
MODEL_PATH = os.path.join("models", "clustering_model.pkl")
PREPROCESSOR_PATH = os.path.join("models", "preprocessor.pkl")

# Module-level caches for the deserialized artifacts. They start out as
# None and are populated lazily by _load_artifacts() on the first request
# that needs them (not at import/startup time).
_model = None
_preprocessor = None


def _load_artifacts():
    """Lazily deserialize the clustering model and preprocessor.

    Populates the module-level ``_model`` and ``_preprocessor`` caches
    from MODEL_PATH / PREPROCESSOR_PATH on first call; once both caches
    are filled, subsequent calls are effectively no-ops.
    """
    global _model, _preprocessor
    _model = joblib.load(MODEL_PATH) if _model is None else _model
    _preprocessor = (
        joblib.load(PREPROCESSOR_PATH) if _preprocessor is None else _preprocessor
    )


@app.route("/")
def index():
return jsonify({"status": "ok", "message": "User Profiling & Segmentation API"})


@app.route("/predict", methods=["POST"])
def predict():
"""
Accept a CSV file upload and return cluster segment labels.
Usage: POST /predict with form-data field 'file' containing a CSV.
"""
if "file" not in request.files:
return jsonify({"error": "No file part in the request"}), 400

file = request.files["file"]
if file.filename == "":
return jsonify({"error": "No file selected"}), 400

try:
df = pd.read_csv(io.StringIO(file.stream.read().decode("utf-8")))
except Exception as e:
return jsonify({"error": f"Failed to parse CSV: {str(e)}"}), 400

try:
_load_artifacts()
df = engineer_features(df)
X = _preprocessor.transform(df)
labels = _model.predict(X)
df["segment"] = labels
return jsonify({"segments": df["segment"].tolist()})
except Exception as e:
return jsonify({"error": str(e)}), 500


if __name__ == "__main__":
app.run(debug=False, host="0.0.0.0", port=5000)
7 changes: 6 additions & 1 deletion generate_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from src.feature_engineering import engineer_features
from src.clustering import cluster_users
import os
import joblib

# Load dataset
df = pd.read_csv("data/user_profiles_for_ads.csv")
Expand All @@ -12,11 +13,14 @@
df = engineer_features(df)

# Preprocess features
X, _ = preprocess_data(df)
X, preprocessor = preprocess_data(df)

# Make sure models folder exists
os.makedirs("models", exist_ok=True)

# Save the preprocessor for use during predictions
joblib.dump(preprocessor, "models/preprocessor.pkl")

# Perform clustering and save model
labels = cluster_users(X, k=4)

Expand All @@ -26,4 +30,5 @@
df.to_csv("outputs/segmented_users.csv", index=False)

print("✅ clustering_model.pkl saved to 'models/'")
print("✅ preprocessor.pkl saved to 'models/'")
print("✅ Segmented users saved to 'outputs/segmented_users.csv'")
3 changes: 1 addition & 2 deletions run_app.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
#!/bin/bash
export FLASK_APP=app/main.py
flask run
python app/main.py
Empty file added src/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion src/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import joblib

def cluster_users(X, k=4):
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
joblib.dump(kmeans, 'models/clustering_model.pkl')
return labels