From 6a3034041d161714f5e4ee52f1a0ba46d2a03817 Mon Sep 17 00:00:00 2001
From: Net Zhang <zhang.11091@osu.edu>
Date: Tue, 2 Jun 2026 10:58:59 -0400
Subject: [PATCH 1/6] Remove FAISS backend

The FAISS KMeans backend added meaningful installation weight and
startup import noise for a marginal benefit. Removing it simplifies the
backend selection logic to two cases:
- cuML if GPU available
- else sklearn

Changes
- Drop `faiss-cpu` & `faiss-gpu-cu12` from main deps and `gpu-*` extras
- Remove FAISS from backend scripts `resolve_backend()`, `run_kmeans()` dispatch
- Remove "faiss" from clustering backend dropdowns in the webUI
- Update README & BACKEND_PIPELINE doc to reflect the changes
---
 README.md                                |  2 +-
 docs/BACKEND_PIPELINE.md                 | 14 ++---
 pyproject.toml                           | 12 ++--
 shared/components/clustering_controls.py |  9 +--
 shared/services/clustering_service.py    |  4 +-
 shared/utils/__init__.py                 |  2 +-
 shared/utils/backend.py                  | 21 +------
 shared/utils/clustering.py               | 77 +++---------------------
 8 files changed, 24 insertions(+), 117 deletions(-)

diff --git a/README.md b/README.md
index 127a842..4e96445 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]"
 uv pip install -e ".[gpu-cu13]"
 ```
 
-The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, FAISS, sklearn) in the sidebar.
+The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, sklearn) in the sidebar.
 
 ## Usage
 
diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md
index 43c1209..0f298d8 100644
--- a/docs/BACKEND_PIPELINE.md
+++ b/docs/BACKEND_PIPELINE.md
@@ -12,11 +12,11 @@ Raw Embeddings (from parquet or model)
   ├─ L2 Normalize: project onto unit hypersphere
   │
   ├─► Step 1: KMeans Clustering (high-dimensional)
-  │     Backend: cuML → FAISS → sklearn
+  │     Backend: cuML (GPU) → sklearn (CPU)
   │
   ├─► Step 2: Dimensionality Reduction to 2D
   │     Method:  PCA / t-SNE / UMAP
-  │     Backend: cuML → sklearn
+  │     Backend: cuML (GPU) → sklearn (CPU)
   │
   └─► Scatter Plot (Altair)
         Color = cluster, position = 2D projection
@@ -46,10 +46,9 @@ feature space, not a lossy 2D projection.
 | Backend | When It's Used | How It Works |
 |---------|---------------|--------------|
 | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. Runs on CuPy arrays. Falls back to sklearn on any error. |
-| **FAISS** | No GPU + >500 samples | Facebook's optimized CPU KMeans using L2 index. Fast for medium datasets. Falls back to sklearn on error. |
-| **sklearn** | Small datasets or fallback | Standard scikit-learn KMeans. Always works, no special dependencies. |
+| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans. Always works, no special dependencies. |
 
-**Auto-selection priority:** cuML > FAISS > sklearn. You can override in the sidebar.
+**Auto-selection priority:** cuML > sklearn. You can override in the sidebar.
 
 ## Step 2: Dimensionality Reduction
 
@@ -96,7 +95,7 @@ When you select "auto" (the default), the app picks the fastest available backen
 
 | Operation | Auto Logic |
 |-----------|-----------|
-| KMeans | cuML if GPU + >500 samples, else FAISS if available + >500 samples, else sklearn |
+| KMeans | cuML if GPU + >500 samples, else sklearn |
 | Dim. Reduction | cuML if GPU + >5000 samples, else sklearn |
 
 Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an
@@ -122,9 +121,6 @@ Check the log file for the full picture when debugging.
 cuML (GPU)
   │ error?
   ▼
-FAISS (CPU, optimized)     ← KMeans only
-  │ error?
-  ▼
 sklearn (CPU, always works)
 ```
 
diff --git a/pyproject.toml b/pyproject.toml
index 24f89af..c793896 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,8 +42,7 @@ dependencies = [
     # Machine learning
     "scikit-learn>=1.0.0",
     "umap-learn>=0.5.0",
-    "numba>=0.57.0", 
-    "faiss-cpu>=1.7.0",
+    "numba>=0.57.0",
     # Vision-language models
     "open-clip-torch>=2.20.0",
     # Custom inference package
@@ -69,20 +68,17 @@ gpu = [
 ]
 gpu-cu12 = [
     "torch>=2.0.0",
-    "cuml-cu12>=25.6",
-    "faiss-gpu-cu12>=1.11.0",
+    "cuml-cu12>=26.4",  # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8)
     "pynvml>=11.0.0",
 ]
 gpu-cu13 = [
     "torch>=2.0.0",
-    "cuml-cu13>=25.12",
-    "faiss-gpu-cu12>=1.11.0",  # no cu13 build on PyPI; cu12 works via CUDA backward compat
+    "cuml-cu13>=26.4",  # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8)
     "pynvml>=11.0.0",
 ]
-# Minimal GPU support (just PyTorch + FAISS GPU, no RAPIDS)
+# Minimal GPU support for image embeddings generation (just PyTorch, no RAPIDS)
 gpu-minimal = [
     "torch>=2.0.0",
-    "faiss-gpu-cu12>=1.11.0",
 ]
 all = [
     "emb-explorer[dev,gpu]",
diff --git a/shared/components/clustering_controls.py b/shared/components/clustering_controls.py
index d18eb5d..34d3e1c 100644
--- a/shared/components/clustering_controls.py
+++ b/shared/components/clustering_controls.py
@@ -5,7 +5,7 @@
 import streamlit as st
 from typing import Tuple, Optional
 
-from shared.utils.backend import HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE
+from shared.utils.backend import HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE
 
 
 def render_clustering_backend_controls():
@@ -19,9 +19,6 @@ def render_clustering_backend_controls():
     dim_reduction_options = ["auto", "sklearn"]
     clustering_options = ["auto", "sklearn"]
 
-    if HAS_FAISS_PACKAGE:
-        clustering_options.append("faiss")
-
     if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE:
         dim_reduction_options.append("cuml")
         clustering_options.append("cuml")
@@ -73,7 +70,7 @@ def render_clustering_backend_controls():
                 max_value=64, 
                 value=8, 
                 step=1,
-                help="Number of parallel workers for CPU backends (sklearn, FAISS). Not used by cuML (GPU manages parallelization automatically)."
+                help="Number of parallel workers for CPU sklearn. Not used by cuML (GPU manages parallelization automatically)."
             )
         
     
@@ -118,8 +115,6 @@ def render_kmeans_controls():
         Tuple of (clustering_backend, n_workers, seed)
     """
     clustering_options = ["auto", "sklearn"]
-    if HAS_FAISS_PACKAGE:
-        clustering_options.append("faiss")
     if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE:
         clustering_options.append("cuml")
 
diff --git a/shared/services/clustering_service.py b/shared/services/clustering_service.py
index 150bcfc..6847b9f 100644
--- a/shared/services/clustering_service.py
+++ b/shared/services/clustering_service.py
@@ -38,8 +38,8 @@ def run_clustering(
             n_clusters: Number of clusters
             reduction_method: Dimensionality reduction method
             n_workers: Number of workers for reduction
-            dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "faiss", "cuml")
-            clustering_backend: Backend for clustering ("auto", "sklearn", "faiss", "cuml")
+            dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "cuml")
+            clustering_backend: Backend for clustering ("auto", "sklearn", "cuml")
             seed: Random seed for reproducibility (None for random)
 
         Returns:
diff --git a/shared/utils/__init__.py b/shared/utils/__init__.py
index b305aa7..2ae7a57 100644
--- a/shared/utils/__init__.py
+++ b/shared/utils/__init__.py
@@ -2,7 +2,7 @@
 Shared utilities for clustering, IO, models, and taxonomy.
 
 Modules are imported lazily to avoid pulling in heavy dependencies
-(sklearn, umap, faiss, cuml, torch, open_clip) at startup.
+(sklearn, umap, cuml, torch, open_clip) at startup.
 Use direct imports instead:
 
     from shared.utils.clustering import reduce_dim, run_kmeans
diff --git a/shared/utils/backend.py b/shared/utils/backend.py
index ed66cad..c75db3f 100644
--- a/shared/utils/backend.py
+++ b/shared/utils/backend.py
@@ -20,7 +20,6 @@
 # These are safe to call at module-load / render time — they only check
 # whether the package is installed, without executing it.
 
-HAS_FAISS_PACKAGE: bool = importlib.util.find_spec("faiss") is not None
 HAS_CUML_PACKAGE: bool = importlib.util.find_spec("cuml") is not None
 HAS_CUPY_PACKAGE: bool = importlib.util.find_spec("cupy") is not None
 HAS_TORCH_PACKAGE: bool = importlib.util.find_spec("torch") is not None
@@ -84,27 +83,16 @@ def check_cuml_available() -> bool:
         return False
 
 
-def check_faiss_available() -> bool:
-    """Check if FAISS is available (actual import, for runtime use)."""
-    if not HAS_FAISS_PACKAGE:
-        return False
-    try:
-        import faiss
-        return True
-    except ImportError:
-        return False
-
-
 def resolve_backend(backend: str, operation: str = "general") -> str:
     """
     Resolve 'auto' backend to actual backend based on available hardware.
 
     Args:
-        backend: Requested backend ("auto", "sklearn", "cuml", "faiss")
+        backend: Requested backend ("auto", "sklearn", "cuml")
         operation: Operation type for logging ("clustering", "reduction", "general")
 
     Returns:
-        Resolved backend name
+        Resolved backend name. CPU paths always go through sklearn.
     """
     if backend != "auto":
         logger.debug(f"Using explicitly requested backend: {backend}")
@@ -112,14 +100,10 @@ def resolve_backend(backend: str, operation: str = "general") -> str:
 
     cuda_available, device_info = check_cuda_available()
     has_cuml = check_cuml_available()
-    has_faiss = check_faiss_available()
 
     if cuda_available and has_cuml:
         resolved = "cuml"
         logger.info(f"Auto-resolved {operation} backend to cuML (GPU: {device_info})")
-    elif has_faiss:
-        resolved = "faiss"
-        logger.info(f"Auto-resolved {operation} backend to FAISS (CPU)")
     else:
         resolved = "sklearn"
         logger.info(f"Auto-resolved {operation} backend to sklearn (CPU)")
@@ -140,7 +124,6 @@ def get_backend_info() -> dict:
         "cuda_available": cuda_available,
         "device_info": device_info,
         "cuml_available": check_cuml_available(),
-        "faiss_available": check_faiss_available(),
     }
 
 
diff --git a/shared/utils/clustering.py b/shared/utils/clustering.py
index f144f4b..ad9fc2e 100644
--- a/shared/utils/clustering.py
+++ b/shared/utils/clustering.py
@@ -8,8 +8,8 @@
 
 from shared.utils.logging_config import get_logger
 from shared.utils.backend import (
-    HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE,
-    check_cuda_available, check_cuml_available, check_faiss_available,
+    HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE,
+    check_cuda_available, check_cuml_available,
 )
 
 logger = get_logger(__name__)
@@ -17,7 +17,6 @@
 # Legacy module-level flags — now backed by lightweight find_spec() checks
 # so importing this module no longer triggers heavy library loads.
 # Functions that actually need the libraries import them locally.
-HAS_FAISS: bool = HAS_FAISS_PACKAGE
 HAS_CUML: bool = HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE
 HAS_CUDA: bool = False  # resolved lazily via check_cuda_available()
 
@@ -342,8 +341,8 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No
         embeddings (np.ndarray): The input feature embeddings of shape (n_samples, n_features).
         n_clusters (int): The number of clusters to form.
         seed (int, optional): Random seed for reproducibility. Defaults to None (random).
-        n_workers (int, optional): Number of parallel workers (used by FAISS and cuML if available).
-        backend (str, optional): Clustering backend - "auto", "sklearn", "faiss", or "cuml". Defaults to "auto".
+        n_workers (int, optional): Number of parallel workers (used by cuML if available).
+        backend (str, optional): Clustering backend - "auto", "sklearn", or "cuml". Defaults to "auto".
 
     Returns:
         kmeans (KMeans or custom object): The fitted clustering object.
@@ -362,20 +361,9 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No
     if backend == "cuml" and HAS_CUML and cuda_available:
         logger.info("Using cuML backend for KMeans")
         result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers)
-    elif backend == "faiss" and HAS_FAISS:
-        logger.info("Using FAISS backend for KMeans")
-        result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers)
-    elif backend == "auto":
-        # Auto selection priority: cuML > FAISS > sklearn
-        if HAS_CUML and cuda_available and n_samples > 500:
-            logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)")
-            result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers)
-        elif HAS_FAISS and n_samples > 500:
-            logger.info("Auto-selected FAISS backend for KMeans (large dataset)")
-            result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers)
-        else:
-            logger.info("Using sklearn backend for KMeans")
-            result = _run_kmeans_sklearn(embeddings, n_clusters, seed)
+    elif backend == "auto" and HAS_CUML and cuda_available and n_samples > 500:
+        logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)")
+        result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers)
     else:
         logger.info("Using sklearn backend for KMeans")
         result = _run_kmeans_sklearn(embeddings, n_clusters, seed)
@@ -443,55 +431,4 @@ def _run_kmeans_sklearn(embeddings: np.ndarray, n_clusters: int, seed: Optional[
     return kmeans, labels
 
 
-def _run_kmeans_faiss(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = None, n_workers: int = 1):
-    """KMeans using FAISS backend for faster clustering."""
-    try:
-        import faiss
-        
-        # Ensure embeddings are float32 and C-contiguous (FAISS requirement)
-        embeddings = np.ascontiguousarray(embeddings.astype(np.float32))
-        
-        n_samples, d = embeddings.shape
-        
-        # Set number of threads for FAISS
-        if n_workers > 1:
-            faiss.omp_set_num_threads(n_workers)
-        
-        # Create FAISS KMeans object
-        kmeans = faiss.Clustering(d, n_clusters)
-        
-        # Set clustering parameters
-        kmeans.verbose = False
-        kmeans.niter = 20  # Number of iterations
-        kmeans.nredo = 1   # Number of redos
-        if seed is not None:
-            kmeans.seed = seed
-        
-        # Use L2 distance (equivalent to sklearn's default)
-        index = faiss.IndexFlatL2(d)
-        
-        # Run clustering
-        kmeans.train(embeddings, index)
-        
-        # Get centroids
-        centroids = faiss.vector_to_array(kmeans.centroids).reshape(n_clusters, d)
-        
-        # Assign labels by finding nearest centroid for each point
-        _, labels = index.search(embeddings, 1)
-        labels = labels.flatten()
-        
-        # Create a simple object to mimic sklearn KMeans interface
-        class FAISSKMeans:
-            def __init__(self, centroids, labels):
-                self.cluster_centers_ = centroids
-                self.labels_ = labels
-                self.n_clusters = len(centroids)
-        
-        return FAISSKMeans(centroids, labels), labels
-        
-    except Exception as e:
-        # Fallback to sklearn if FAISS fails
-        logger.warning(f"FAISS clustering failed ({e}), falling back to sklearn")
-        return _run_kmeans_sklearn(embeddings, n_clusters, seed)
-
 

From 868fb21ea3ae5c33e2c7c1e98ce48e7df7cca160 Mon Sep 17 00:00:00 2001
From: Net Zhang <zhang.11091@osu.edu>
Date: Tue, 2 Jun 2026 11:37:56 -0400
Subject: [PATCH 2/6] Auto-enable `scikit-learn-intelex` for CPU `sklearn`
 accel

Add `scikit-learn-intelex` as default dependency and patch sklearn at
import time in `shared/utils/clustering.py`. Accelerates the existing
`sklearn` PCA / TSNE / KMeans calls on CPU.

UMAP is unaffected because `umap-learn` is a separate package, not one
of the `sklearn` algorithms that get patched.

Set EMB_EXPLORER_DISABLE_SKLEARNEX=1 to opt out for debugging
vanilla sklearn behavior.
---
 README.md                  |  2 +-
 docs/BACKEND_PIPELINE.md   | 10 +++++-----
 pyproject.toml             |  3 +++
 shared/utils/clustering.py | 12 ++++++++++++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 4e96445..d0c7d1a 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]"
 uv pip install -e ".[gpu-cu13]"
 ```
 
-The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, sklearn) in the sidebar.
+The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL). You can also manually select backends (cuML, sklearn) in the sidebar.
 
 ## Usage
 
diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md
index 0f298d8..8a35c23 100644
--- a/docs/BACKEND_PIPELINE.md
+++ b/docs/BACKEND_PIPELINE.md
@@ -12,11 +12,11 @@ Raw Embeddings (from parquet or model)
   ├─ L2 Normalize: project onto unit hypersphere
   │
   ├─► Step 1: KMeans Clustering (high-dimensional)
-  │     Backend: cuML (GPU) → sklearn (CPU)
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex)
   │
   ├─► Step 2: Dimensionality Reduction to 2D
   │     Method:  PCA / t-SNE / UMAP
-  │     Backend: cuML (GPU) → sklearn (CPU)
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex for PCA/TSNE)
   │
   └─► Scatter Plot (Altair)
         Color = cluster, position = 2D projection
@@ -46,7 +46,7 @@ feature space, not a lossy 2D projection.
 | Backend | When It's Used | How It Works |
 |---------|---------------|--------------|
 | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. Runs on CuPy arrays. Falls back to sklearn on any error. |
-| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans. Always works, no special dependencies. |
+| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. |
 
 **Auto-selection priority:** cuML > sklearn. You can override in the sidebar.
 
@@ -95,8 +95,8 @@ When you select "auto" (the default), the app picks the fastest available backen
 
 | Operation | Auto Logic |
 |-----------|-----------|
-| KMeans | cuML if GPU + >500 samples, else sklearn |
-| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn |
+| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by sklearn-intelex when installed) |
+| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by sklearn-intelex for PCA / t-SNE) |
 
 Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an
 automatic retry with sklearn. OOM errors are surfaced to the user with guidance.
diff --git a/pyproject.toml b/pyproject.toml
index c793896..0551f5f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,9 @@ dependencies = [
     "altair>=5.0.0",
     # Machine learning
     "scikit-learn>=1.0.0",
+    # Intel oneDAL acceleration for sklearn (PCA / TSNE / KMeans) auto-patched at runtime.
+    # Disable with EMB_EXPLORER_DISABLE_SKLEARNEX=1 if you need vanilla sklearn behavior for debugging.
+    "scikit-learn-intelex>=2025.0",
     "umap-learn>=0.5.0",
     "numba>=0.57.0",
     # Vision-language models
diff --git a/shared/utils/clustering.py b/shared/utils/clustering.py
index ad9fc2e..4f85e83 100644
--- a/shared/utils/clustering.py
+++ b/shared/utils/clustering.py
@@ -14,6 +14,18 @@
 
 logger = get_logger(__name__)
 
+# Auto-enable scikit-learn-intelex (Intel oneDAL) acceleration for sklearn's
+# PCA / TSNE / KMeans on CPU. Patches sklearn at import time so any downstream
+# sklearn call gets the accelerated path transparently. Disable for debugging
+# vanilla sklearn behavior with: EMB_EXPLORER_DISABLE_SKLEARNEX=1
+if os.environ.get("EMB_EXPLORER_DISABLE_SKLEARNEX", "0") != "1":
+    try:
+        from sklearnex import patch_sklearn
+        patch_sklearn()
+        logger.info("scikit-learn-intelex enabled (CPU sklearn auto-accelerated)")
+    except ImportError:
+        logger.debug("scikit-learn-intelex not installed; using vanilla sklearn")
+
 # Legacy module-level flags — now backed by lightweight find_spec() checks
 # so importing this module no longer triggers heavy library loads.
 # Functions that actually need the libraries import them locally.

From 788ec60c4d66ef2504a6c8bac5f93c5e29ce4918 Mon Sep 17 00:00:00 2001
From: Net Zhang <48858129+NetZissou@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:05:01 -0400
Subject: [PATCH 3/6] Added footnote for `sklearn-intelex` in README &
 BACKEND_PIPELINE

Co-authored-by: Net Zhang <48858129+NetZissou@users.noreply.github.com>
---
 README.md                | 4 +++-
 docs/BACKEND_PIPELINE.md | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d0c7d1a..7b8f58a 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]"
 uv pip install -e ".[gpu-cu13]"
 ```
 
-The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL). You can also manually select backends (cuML, sklearn) in the sidebar.
+The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) [^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar.
 
 ## Usage
 
@@ -96,3 +96,5 @@ ssh -N -L 8501:<COMPUTE_NODE>:8501 <USER>@<LOGIN_NODE>
 ## Acknowledgements
 
 [OpenCLIP](https://github.com/mlfoundations/open_clip) | [Streamlit](https://streamlit.io/) | [Altair](https://altair-viz.github.io/)
+
+[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library that provides accelerations on x86_64 Linux and Windows machines, and silently falls back to vanilla `sklearn` on unsupported architectures like Apple Silicon and ARM Linux. The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project) so cross-vendor support is a stated goal.
diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md
index 8a35c23..1ad8071 100644
--- a/docs/BACKEND_PIPELINE.md
+++ b/docs/BACKEND_PIPELINE.md
@@ -12,11 +12,11 @@ Raw Embeddings (from parquet or model)
   ├─ L2 Normalize: project onto unit hypersphere
   │
   ├─► Step 1: KMeans Clustering (high-dimensional)
-  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex)
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` [^1])
   │
   ├─► Step 2: Dimensionality Reduction to 2D
   │     Method:  PCA / t-SNE / UMAP
-  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex for PCA/TSNE)
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA/TSNE [^1])
   │
   └─► Scatter Plot (Altair)
         Color = cluster, position = 2D projection
@@ -95,7 +95,7 @@ When you select "auto" (the default), the app picks the fastest available backen
 
 | Operation | Auto Logic |
 |-----------|-----------|
-| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by sklearn-intelex when installed) |
+| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed [^1]) |
 | Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by sklearn-intelex for PCA / t-SNE) |
 
 Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an
@@ -126,3 +126,5 @@ sklearn (CPU, always works)
 
 The app is designed to *always produce a result*. GPU acceleration is a
 nice-to-have, never a hard requirement.
+
+[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library that provides accelerations on x86_64 Linux and Windows machines, and silently falls back to vanilla `sklearn` on unsupported architectures like Apple Silicon and ARM Linux. The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project) so cross-vendor support is a stated goal.

From 133e9bcfa0b9576e9fad2669e310c858350d1208 Mon Sep 17 00:00:00 2001
From: egrace479 <e.campolongo479@gmail.com>
Date: Mon, 8 Jun 2026 15:49:30 -0400
Subject: [PATCH 4/6] fix for optional extension install only on os that can
 use it

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0551f5f..2f2f4b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,7 @@ dependencies = [
     "scikit-learn>=1.0.0",
     # Intel oneDAL acceleration for sklearn (PCA / TSNE / KMeans) auto-patched at runtime.
     # Disable with EMB_EXPLORER_DISABLE_SKLEARNEX=1 if you need vanilla sklearn behavior for debugging.
-    "scikit-learn-intelex>=2025.0",
+    "scikit-learn-intelex>=2025.0; platform_machine == 'AMD64' or platform_machine == 'x86_64'",
     "umap-learn>=0.5.0",
     "numba>=0.57.0",
     # Vision-language models

From 3d6e63b7d44d6b87955c89deea427281731fe903 Mon Sep 17 00:00:00 2001
From: Net Zhang <zhang.11091@osu.edu>
Date: Tue, 9 Jun 2026 10:03:31 -0400
Subject: [PATCH 5/6] Apply footnote format changes

Co-Authored-By: egrace479 <egrace479@users.noreply.github.com>
---
 README.md                |  2 +-
 docs/BACKEND_PIPELINE.md | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7b8f58a..2cff7ad 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]"
 uv pip install -e ".[gpu-cu13]"
 ```
 
-The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) [^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar.
+The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex)[^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar.
 
 ## Usage
 
diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md
index 1ad8071..92c2b4c 100644
--- a/docs/BACKEND_PIPELINE.md
+++ b/docs/BACKEND_PIPELINE.md
@@ -12,16 +12,18 @@ Raw Embeddings (from parquet or model)
   ├─ L2 Normalize: project onto unit hypersphere
   │
   ├─► Step 1: KMeans Clustering (high-dimensional)
-  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` [^1])
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex`)
   │
   ├─► Step 2: Dimensionality Reduction to 2D
   │     Method:  PCA / t-SNE / UMAP
-  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA/TSNE [^1])
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA/TSNE)
   │
   └─► Scatter Plot (Altair)
         Color = cluster, position = 2D projection
 ```
 
+Note that `sklearn-intelex` acceleration is used for CPU operations where available[^1].
+
 ## Step 0: Embedding Preparation
 
 Before any computation, every embedding goes through `_prepare_embeddings()`:
@@ -46,7 +48,7 @@ feature space, not a lossy 2D projection.
 | Backend | When It's Used | How It Works |
 |---------|---------------|--------------|
 | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. Runs on CuPy arrays. Falls back to sklearn on any error. |
-| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. |
+| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed[^1] — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. |
 
 **Auto-selection priority:** cuML > sklearn. You can override in the sidebar.
 
@@ -95,7 +97,7 @@ When you select "auto" (the default), the app picks the fastest available backen
 
 | Operation | Auto Logic |
 |-----------|-----------|
-| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed [^1]) |
+| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed[^1]) |
 | Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by sklearn-intelex for PCA / t-SNE) |
 
 Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an

From a021cc8cc063893b0ba19b443601c7030ee4b614 Mon Sep 17 00:00:00 2001
From: Net Zhang <zhang.11091@osu.edu>
Date: Tue, 9 Jun 2026 10:41:18 -0400
Subject: [PATCH 6/6] Add CI Matrix to test install & import on OS + Py Matrix

Matrix:
- OS: ubuntu-latest, windows-latest, macos-latest
- Python: 3.10, 3.11, 3.12, 3.13

Each cell installs the CPU base (no gpu extras), runs an import smoke
test for shared/utils/clustering and both Streamlit app entry points,
then verifies the sklearnex platform marker: present on x86_64/AMD64,
absent on arm64.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/install-and-import.yaml | 59 +++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 .github/workflows/install-and-import.yaml

diff --git a/.github/workflows/install-and-import.yaml b/.github/workflows/install-and-import.yaml
new file mode 100644
index 0000000..a188136
--- /dev/null
+++ b/.github/workflows/install-and-import.yaml
@@ -0,0 +1,59 @@
+name: install + import
+
+# CI matrix: verify the package installs cleanly and core modules
+# import on every supported (OS, Python) pair, per pkg pyproject.toml's
+# requires-python = ">=3.10,<3.14".
+on:
+  pull_request:
+  workflow_dispatch:  # allow manual reruns from the Actions tab
+
+jobs:
+  install-and-import:
+    name: ${{ matrix.os }} / py${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      # Don't abort the whole matrix on a single failure so we can see all platforms.
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python-version: ['3.10', '3.11', '3.12', '3.13']
+    steps:
+      - uses: actions/checkout@v4
+
+      # uv is our package manager of record, mirrors what users do locally.
+      - uses: astral-sh/setup-uv@v6
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # CPU-only install. gpu-cu12 / gpu-cu13 extras are skipped on CI runners
+      # (no NVIDIA hardware, CUDA wheels are large and slow to resolve).
+      - name: Install package
+        run: uv pip install --system -e .
+
+      # Smoke-test that the modules that get patched or accelerated import
+      # cleanly, and that both Streamlit app entry points are importable.
+      - name: Import smoke test
+        run: |
+          python -c "import shared.utils.clustering"
+          python -c "import apps.precalculated.app"
+          python -c "import apps.embed_explore.app"
+
+      # On x86_64 / AMD64 sklearnex must install (the platform marker in
+      # pyproject.toml ensures it). On macos-latest (arm64) it must NOT
+      # install, which is the whole point of the marker.
+      - name: Verify sklearnex matches platform marker
+        shell: python
+        run: |
+          import platform
+          import importlib.util
+          on_x86 = platform.machine() in ('x86_64', 'AMD64')
+          present = importlib.util.find_spec('sklearnex') is not None
+          if on_x86:
+              assert present, 'sklearnex should be installed on x86_64/AMD64'
+              import sklearnex
+              print(f'sklearnex {sklearnex.__version__} present on {platform.machine()}')
+          else:
+              assert not present, f'sklearnex must not install on {platform.machine()}'
+              print(f'sklearnex correctly absent on {platform.machine()}')