diff --git a/.github/workflows/install-and-import.yaml b/.github/workflows/install-and-import.yaml
new file mode 100644
index 0000000..a188136
--- /dev/null
+++ b/.github/workflows/install-and-import.yaml
@@ -0,0 +1,59 @@
+name: install + import
+
+# CI matrix: verify that the package installs cleanly and that its core
+# modules import on every supported (OS, Python) pair, matching the
+# requires-python = ">=3.10,<3.14" constraint in pyproject.toml.
+on:
+  pull_request:
+  workflow_dispatch:  # allow manual reruns from the Actions tab
+
+jobs:
+  install-and-import:
+    name: ${{ matrix.os }} / py${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      # Don't abort the whole matrix on a single failure so we can see all platforms.
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python-version: ['3.10', '3.11', '3.12', '3.13']
+    steps:
+      - uses: actions/checkout@v4
+
+      # uv is our package manager of record; using it here mirrors what users do locally.
+      - uses: astral-sh/setup-uv@v6
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # CPU-only install. gpu-cu12 / gpu-cu13 extras are skipped on CI runners
+      # (no NVIDIA hardware, CUDA wheels are large and slow to resolve).
+      - name: Install package
+        run: uv pip install --system -e .
+
+      # Smoke-test that the modules that get patched or accelerated import
+      # cleanly, and that both Streamlit app entry points are importable.
+      - name: Import smoke test
+        run: |
+          python -c "import shared.utils.clustering"
+          python -c "import apps.precalculated.app"
+          python -c "import apps.embed_explore.app"
+
+      # On x86_64 / AMD64 sklearnex must install (the platform marker in
+      # pyproject.toml ensures it). On macos-latest (arm64) it must NOT
+      # install, which is the whole point of the marker.
+      - name: Verify sklearnex matches platform marker
+        shell: python
+        run: |
+          import platform
+          import importlib.util
+          on_x86 = platform.machine() in ('x86_64', 'AMD64')
+          present = importlib.util.find_spec('sklearnex') is not None
+          if on_x86:
+              assert present, 'sklearnex should be installed on x86_64/AMD64'
+              import sklearnex
+              print(f'sklearnex {sklearnex.__version__} present on {platform.machine()}')
+          else:
+              assert not present, f'sklearnex must not install on {platform.machine()}'
+              print(f'sklearnex correctly absent on {platform.machine()}')
diff --git a/README.md b/README.md
index 127a842..2cff7ad 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]"
 uv pip install -e ".[gpu-cu13]"
 ```
 
-The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, FAISS, sklearn) in the sidebar.
+The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex)[^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar.
 
 ## Usage
 
@@ -96,3 +96,5 @@ ssh -N -L 8501:<COMPUTE_NODE>:8501 <USER>@<LOGIN_NODE>
 ## Acknowledgements
 
 [OpenCLIP](https://github.com/mlfoundations/open_clip) | [Streamlit](https://streamlit.io/) | [Altair](https://altair-viz.github.io/)
+
+[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library, which provides acceleration on x86_64 Linux and Windows machines and silently falls back to vanilla `sklearn` on unsupported architectures such as Apple Silicon and ARM Linux. The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project), so cross-vendor support is a stated goal.
diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md
index 43c1209..92c2b4c 100644
--- a/docs/BACKEND_PIPELINE.md
+++ b/docs/BACKEND_PIPELINE.md
@@ -12,16 +12,18 @@ Raw Embeddings (from parquet or model)
   ├─ L2 Normalize: project onto unit hypersphere
   │
   ├─► Step 1: KMeans Clustering (high-dimensional)
-  │     Backend: cuML → FAISS → sklearn
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex`)
   │
   ├─► Step 2: Dimensionality Reduction to 2D
   │     Method:  PCA / t-SNE / UMAP
-  │     Backend: cuML → sklearn
+  │     Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA / t-SNE)
   │
   └─► Scatter Plot (Altair)
         Color = cluster, position = 2D projection
 ```
 
+Note that `sklearn-intelex` acceleration is used for CPU operations where available[^1].
+
 ## Step 0: Embedding Preparation
 
 Before any computation, every embedding goes through `_prepare_embeddings()`:
@@ -46,10 +48,9 @@ feature space, not a lossy 2D projection.
 | Backend | When It's Used | How It Works |
 |---------|---------------|--------------|
 | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. Runs on CuPy arrays. Falls back to sklearn on any error. |
-| **FAISS** | No GPU + >500 samples | Facebook's optimized CPU KMeans using L2 index. Fast for medium datasets. Falls back to sklearn on error. |
-| **sklearn** | Small datasets or fallback | Standard scikit-learn KMeans. Always works, no special dependencies. |
+| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed[^1] — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. |
 
-**Auto-selection priority:** cuML > FAISS > sklearn. You can override in the sidebar.
+**Auto-selection priority:** cuML > sklearn. You can override in the sidebar.
 
 ## Step 2: Dimensionality Reduction
 
@@ -96,8 +97,8 @@ When you select "auto" (the default), the app picks the fastest available backen
 
 | Operation | Auto Logic |
 |-----------|-----------|
-| KMeans | cuML if GPU + >500 samples, else FAISS if available + >500 samples, else sklearn |
-| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn |
+| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed[^1]) |
+| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by `sklearn-intelex` for PCA / t-SNE) |
 
 Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an
 automatic retry with sklearn. OOM errors are surfaced to the user with guidance.
@@ -122,11 +123,10 @@ Check the log file for the full picture when debugging.
 cuML (GPU)
   │ error?
   ▼
-FAISS (CPU, optimized)     ← KMeans only
-  │ error?
-  ▼
 sklearn (CPU, always works)
 ```
 
 The app is designed to *always produce a result*. GPU acceleration is a
 nice-to-have, never a hard requirement.
+
+[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library, which provides acceleration on x86_64 Linux and Windows machines and silently falls back to vanilla `sklearn` on unsupported architectures such as Apple Silicon and ARM Linux. The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project), so cross-vendor support is a stated goal.
diff --git a/pyproject.toml b/pyproject.toml
index 24f89af..2f2f4b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,9 +41,11 @@ dependencies = [
     "altair>=5.0.0",
     # Machine learning
     "scikit-learn>=1.0.0",
+    # Intel oneDAL acceleration for sklearn (PCA / TSNE / KMeans) auto-patched at runtime.
+    # Disable with EMB_EXPLORER_DISABLE_SKLEARNEX=1 if you need vanilla sklearn behavior for debugging.
+    "scikit-learn-intelex>=2025.0; platform_machine == 'AMD64' or platform_machine == 'x86_64'",
     "umap-learn>=0.5.0",
-    "numba>=0.57.0", 
-    "faiss-cpu>=1.7.0",
+    "numba>=0.57.0",
     # Vision-language models
     "open-clip-torch>=2.20.0",
     # Custom inference package
@@ -69,20 +71,17 @@ gpu = [
 ]
 gpu-cu12 = [
     "torch>=2.0.0",
-    "cuml-cu12>=25.6",
-    "faiss-gpu-cu12>=1.11.0",
+    "cuml-cu12>=26.4",  # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8)
     "pynvml>=11.0.0",
 ]
 gpu-cu13 = [
     "torch>=2.0.0",
-    "cuml-cu13>=25.12",
-    "faiss-gpu-cu12>=1.11.0",  # no cu13 build on PyPI; cu12 works via CUDA backward compat
+    "cuml-cu13>=26.4",  # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8)
     "pynvml>=11.0.0",
 ]
-# Minimal GPU support (just PyTorch + FAISS GPU, no RAPIDS)
+# Minimal GPU support for image embedding generation (just PyTorch, no RAPIDS)
 gpu-minimal = [
     "torch>=2.0.0",
-    "faiss-gpu-cu12>=1.11.0",
 ]
 all = [
     "emb-explorer[dev,gpu]",
diff --git a/shared/components/clustering_controls.py b/shared/components/clustering_controls.py
index d18eb5d..34d3e1c 100644
--- a/shared/components/clustering_controls.py
+++ b/shared/components/clustering_controls.py
@@ -5,7 +5,7 @@
 import streamlit as st
 from typing import Tuple, Optional
 
-from shared.utils.backend import HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE
+from shared.utils.backend import HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE
 
 
 def render_clustering_backend_controls():
@@ -19,9 +19,6 @@ def render_clustering_backend_controls():
     dim_reduction_options = ["auto", "sklearn"]
     clustering_options = ["auto", "sklearn"]
 
-    if HAS_FAISS_PACKAGE:
-        clustering_options.append("faiss")
-
     if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE:
         dim_reduction_options.append("cuml")
         clustering_options.append("cuml")
@@ -73,7 +70,7 @@ def render_clustering_backend_controls():
                 max_value=64, 
                 value=8, 
                 step=1,
-                help="Number of parallel workers for CPU backends (sklearn, FAISS). Not used by cuML (GPU manages parallelization automatically)."
+                help="Number of parallel workers for CPU sklearn. Not used by cuML (GPU manages parallelization automatically)."
             )
         
     
@@ -118,8 +115,6 @@ def render_kmeans_controls():
         Tuple of (clustering_backend, n_workers, seed)
     """
     clustering_options = ["auto", "sklearn"]
-    if HAS_FAISS_PACKAGE:
-        clustering_options.append("faiss")
     if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE:
         clustering_options.append("cuml")
 
diff --git a/shared/services/clustering_service.py b/shared/services/clustering_service.py
index 150bcfc..6847b9f 100644
--- a/shared/services/clustering_service.py
+++ b/shared/services/clustering_service.py
@@ -38,8 +38,8 @@ def run_clustering(
             n_clusters: Number of clusters
             reduction_method: Dimensionality reduction method
             n_workers: Number of workers for reduction
-            dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "faiss", "cuml")
-            clustering_backend: Backend for clustering ("auto", "sklearn", "faiss", "cuml")
+            dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "cuml")
+            clustering_backend: Backend for clustering ("auto", "sklearn", "cuml")
             seed: Random seed for reproducibility (None for random)
 
         Returns:
diff --git a/shared/utils/__init__.py b/shared/utils/__init__.py
index b305aa7..2ae7a57 100644
--- a/shared/utils/__init__.py
+++ b/shared/utils/__init__.py
@@ -2,7 +2,7 @@
 Shared utilities for clustering, IO, models, and taxonomy.
 
 Modules are imported lazily to avoid pulling in heavy dependencies
-(sklearn, umap, faiss, cuml, torch, open_clip) at startup.
+(sklearn, umap, cuml, torch, open_clip) at startup.
 Use direct imports instead:
 
     from shared.utils.clustering import reduce_dim, run_kmeans
diff --git a/shared/utils/backend.py b/shared/utils/backend.py
index ed66cad..c75db3f 100644
--- a/shared/utils/backend.py
+++ b/shared/utils/backend.py
@@ -20,7 +20,6 @@
 # These are safe to call at module-load / render time — they only check
 # whether the package is installed, without executing it.
 
-HAS_FAISS_PACKAGE: bool = importlib.util.find_spec("faiss") is not None
 HAS_CUML_PACKAGE: bool = importlib.util.find_spec("cuml") is not None
 HAS_CUPY_PACKAGE: bool = importlib.util.find_spec("cupy") is not None
 HAS_TORCH_PACKAGE: bool = importlib.util.find_spec("torch") is not None
@@ -84,27 +83,16 @@ def check_cuml_available() -> bool:
         return False
 
 
-def check_faiss_available() -> bool:
-    """Check if FAISS is available (actual import, for runtime use)."""
-    if not HAS_FAISS_PACKAGE:
-        return False
-    try:
-        import faiss
-        return True
-    except ImportError:
-        return False
-
-
 def resolve_backend(backend: str, operation: str = "general") -> str:
     """
     Resolve 'auto' backend to actual backend based on available hardware.
 
     Args:
-        backend: Requested backend ("auto", "sklearn", "cuml", "faiss")
+        backend: Requested backend ("auto", "sklearn", "cuml")
         operation: Operation type for logging ("clustering", "reduction", "general")
 
     Returns:
-        Resolved backend name
+        Resolved backend name. CPU paths always go through sklearn.
     """
     if backend != "auto":
         logger.debug(f"Using explicitly requested backend: {backend}")
@@ -112,14 +100,10 @@ def resolve_backend(backend: str, operation: str = "general") -> str:
 
     cuda_available, device_info = check_cuda_available()
     has_cuml = check_cuml_available()
-    has_faiss = check_faiss_available()
 
     if cuda_available and has_cuml:
         resolved = "cuml"
         logger.info(f"Auto-resolved {operation} backend to cuML (GPU: {device_info})")
-    elif has_faiss:
-        resolved = "faiss"
-        logger.info(f"Auto-resolved {operation} backend to FAISS (CPU)")
     else:
         resolved = "sklearn"
         logger.info(f"Auto-resolved {operation} backend to sklearn (CPU)")
@@ -140,7 +124,6 @@ def get_backend_info() -> dict:
         "cuda_available": cuda_available,
         "device_info": device_info,
         "cuml_available": check_cuml_available(),
-        "faiss_available": check_faiss_available(),
     }
 
 
diff --git a/shared/utils/clustering.py b/shared/utils/clustering.py
index f144f4b..4f85e83 100644
--- a/shared/utils/clustering.py
+++ b/shared/utils/clustering.py
@@ -8,16 +8,27 @@
 
 from shared.utils.logging_config import get_logger
 from shared.utils.backend import (
-    HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE,
-    check_cuda_available, check_cuml_available, check_faiss_available,
+    HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE,
+    check_cuda_available, check_cuml_available,
 )
 
 logger = get_logger(__name__)
 
+# Auto-enable scikit-learn-intelex (Intel oneDAL) acceleration for sklearn's
+# PCA / TSNE / KMeans on CPU. Patches sklearn at import time so any downstream
+# sklearn call gets the accelerated path transparently. Disable for debugging
+# vanilla sklearn behavior with: EMB_EXPLORER_DISABLE_SKLEARNEX=1
+if os.environ.get("EMB_EXPLORER_DISABLE_SKLEARNEX", "0") != "1":
+    try:
+        from sklearnex import patch_sklearn
+        patch_sklearn()
+        logger.info("scikit-learn-intelex enabled (CPU sklearn auto-accelerated)")
+    except ImportError:
+        logger.debug("scikit-learn-intelex not installed; using vanilla sklearn")
+
 # Legacy module-level flags — now backed by lightweight find_spec() checks
 # so importing this module no longer triggers heavy library loads.
 # Functions that actually need the libraries import them locally.
-HAS_FAISS: bool = HAS_FAISS_PACKAGE
 HAS_CUML: bool = HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE
 HAS_CUDA: bool = False  # resolved lazily via check_cuda_available()
 
@@ -342,8 +353,8 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No
         embeddings (np.ndarray): The input feature embeddings of shape (n_samples, n_features).
         n_clusters (int): The number of clusters to form.
         seed (int, optional): Random seed for reproducibility. Defaults to None (random).
-        n_workers (int, optional): Number of parallel workers (used by FAISS and cuML if available).
-        backend (str, optional): Clustering backend - "auto", "sklearn", "faiss", or "cuml". Defaults to "auto".
+        n_workers (int, optional): Number of parallel workers (used by cuML if available).
+        backend (str, optional): Clustering backend - "auto", "sklearn", or "cuml". Defaults to "auto".
 
     Returns:
         kmeans (KMeans or custom object): The fitted clustering object.
@@ -362,20 +373,9 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No
     if backend == "cuml" and HAS_CUML and cuda_available:
         logger.info("Using cuML backend for KMeans")
         result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers)
-    elif backend == "faiss" and HAS_FAISS:
-        logger.info("Using FAISS backend for KMeans")
-        result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers)
-    elif backend == "auto":
-        # Auto selection priority: cuML > FAISS > sklearn
-        if HAS_CUML and cuda_available and n_samples > 500:
-            logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)")
-            result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers)
-        elif HAS_FAISS and n_samples > 500:
-            logger.info("Auto-selected FAISS backend for KMeans (large dataset)")
-            result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers)
-        else:
-            logger.info("Using sklearn backend for KMeans")
-            result = _run_kmeans_sklearn(embeddings, n_clusters, seed)
+    elif backend == "auto" and HAS_CUML and cuda_available and n_samples > 500:
+        logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)")
+        result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers)
     else:
         logger.info("Using sklearn backend for KMeans")
         result = _run_kmeans_sklearn(embeddings, n_clusters, seed)
@@ -443,55 +443,4 @@ def _run_kmeans_sklearn(embeddings: np.ndarray, n_clusters: int, seed: Optional[
     return kmeans, labels
 
 
-def _run_kmeans_faiss(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = None, n_workers: int = 1):
-    """KMeans using FAISS backend for faster clustering."""
-    try:
-        import faiss
-        
-        # Ensure embeddings are float32 and C-contiguous (FAISS requirement)
-        embeddings = np.ascontiguousarray(embeddings.astype(np.float32))
-        
-        n_samples, d = embeddings.shape
-        
-        # Set number of threads for FAISS
-        if n_workers > 1:
-            faiss.omp_set_num_threads(n_workers)
-        
-        # Create FAISS KMeans object
-        kmeans = faiss.Clustering(d, n_clusters)
-        
-        # Set clustering parameters
-        kmeans.verbose = False
-        kmeans.niter = 20  # Number of iterations
-        kmeans.nredo = 1   # Number of redos
-        if seed is not None:
-            kmeans.seed = seed
-        
-        # Use L2 distance (equivalent to sklearn's default)
-        index = faiss.IndexFlatL2(d)
-        
-        # Run clustering
-        kmeans.train(embeddings, index)
-        
-        # Get centroids
-        centroids = faiss.vector_to_array(kmeans.centroids).reshape(n_clusters, d)
-        
-        # Assign labels by finding nearest centroid for each point
-        _, labels = index.search(embeddings, 1)
-        labels = labels.flatten()
-        
-        # Create a simple object to mimic sklearn KMeans interface
-        class FAISSKMeans:
-            def __init__(self, centroids, labels):
-                self.cluster_centers_ = centroids
-                self.labels_ = labels
-                self.n_clusters = len(centroids)
-        
-        return FAISSKMeans(centroids, labels), labels
-        
-    except Exception as e:
-        # Fallback to sklearn if FAISS fails
-        logger.warning(f"FAISS clustering failed ({e}), falling back to sklearn")
-        return _run_kmeans_sklearn(embeddings, n_clusters, seed)
-