diff --git a/app b/app
deleted file mode 160000
index eaaf35b..0000000
--- a/app
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit eaaf35be10af4fe50999324ee308580169b4fba5
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..dc96f30
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,63 @@
+import os
+import io
+import joblib
+import pandas as pd
+from flask import Flask, request, jsonify
+
+from src.feature_engineering import engineer_features
+from src.data_preprocessing import preprocess_data
+
+app = Flask(__name__)
+
+MODEL_PATH = os.path.join("models", "clustering_model.pkl")
+PREPROCESSOR_PATH = os.path.join("models", "preprocessor.pkl")
+
+# Load model and preprocessor once at startup (lazy on first request)
+_model = None
+_preprocessor = None
+
+
+def _load_artifacts():
+    global _model, _preprocessor
+    if _model is None:
+        _model = joblib.load(MODEL_PATH)
+    if _preprocessor is None:
+        _preprocessor = joblib.load(PREPROCESSOR_PATH)
+
+
+@app.route("/")
+def index():
+    return jsonify({"status": "ok", "message": "User Profiling & Segmentation API"})
+
+
+@app.route("/predict", methods=["POST"])
+def predict():
+    """
+    Accept a CSV file upload and return cluster segment labels.
+    Usage: POST /predict with form-data field 'file' containing a CSV.
+    """
+    if "file" not in request.files:
+        return jsonify({"error": "No file part in the request"}), 400
+
+    file = request.files["file"]
+    if file.filename == "":
+        return jsonify({"error": "No file selected"}), 400
+
+    try:
+        df = pd.read_csv(io.StringIO(file.stream.read().decode("utf-8")))
+    except Exception as e:
+        return jsonify({"error": f"Failed to parse CSV: {str(e)}"}), 400
+
+    try:
+        _load_artifacts()
+        df = engineer_features(df)
+        X = _preprocessor.transform(df)
+        labels = _model.predict(X)
+        df["segment"] = labels
+        return jsonify({"segments": df["segment"].tolist()})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+
+if __name__ == "__main__":
+    app.run(debug=False, host="0.0.0.0", port=5000)
diff --git a/generate_model.py b/generate_model.py
index dcd17b2..65102ac 100644
--- a/generate_model.py
+++ b/generate_model.py
@@ -4,6 +4,7 @@
 from src.feature_engineering import engineer_features
 from src.clustering import cluster_users
 import os
+import joblib
 
 # Load dataset
 df = pd.read_csv("data/user_profiles_for_ads.csv")
@@ -12,11 +13,14 @@
 df = engineer_features(df)
 
 # Preprocess features
-X, _ = preprocess_data(df)
+X, preprocessor = preprocess_data(df)
 
 # Make sure models folder exists
 os.makedirs("models", exist_ok=True)
 
+# Save the preprocessor for use during predictions
+joblib.dump(preprocessor, "models/preprocessor.pkl")
+
 # Perform clustering and save model
 labels = cluster_users(X, k=4)
 
@@ -26,4 +30,5 @@
 df.to_csv("outputs/segmented_users.csv", index=False)
 
 print("✅ clustering_model.pkl saved to 'models/'")
+print("✅ preprocessor.pkl saved to 'models/'")
 print("✅ Segmented users saved to 'outputs/segmented_users.csv'")
diff --git a/run_app.sh b/run_app.sh
index ce531a2..0dd5156 100644
--- a/run_app.sh
+++ b/run_app.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
-export FLASK_APP=app/main.py
-flask run
+# Run as a module from the repo root: launching app/main.py by path would put app/ (not the repo root) on sys.path, so the top-level src package could not be imported.
+python -m app.main
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/clustering.py b/src/clustering.py
index d240f09..52b612f 100644
--- a/src/clustering.py
+++ b/src/clustering.py
@@ -3,7 +3,7 @@
 import joblib
 
 def cluster_users(X, k=4):
-    kmeans = KMeans(n_clusters=k, random_state=42)
+    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
     labels = kmeans.fit_predict(X)
     joblib.dump(kmeans, 'models/clustering_model.pkl')
     return labels